use of org.apache.tez.runtime.library.cartesianproduct.CartesianProductConfig in project hive by apache.
the class DagUtils method createVertex.
/**
 * Create a vertex from a given work object.
 *
 * <p>The call is dispatched to the overload matching the concrete type of {@code work}
 * (MapWork, ReduceWork or MergeJoinWork). A MergeJoinWork vertex that has at least one
 * parent connected through an XPROD_EDGE additionally gets a CartesianProductVertexManager.
 * Afterwards the stats publisher is initialized when the work gathers stats, and a vertex
 * without children receives an MROutput data sink named {@code "out_" + work.getName()}.
 *
 * @param conf JobConf to be used to this execution unit
 * @param work The instance of BaseWork representing the actual work to be performed
 * by this vertex.
 * @param scratchDir HDFS scratch dir for this execution unit.
 * @param fileSystem FS corresponding to scratchDir and LocalResources
 * @param ctx This query's context
 * @param hasChildren when {@code false} a data sink is attached, because final vertices
 * need to have at least one output
 * @param tezWork used to look up the parents of {@code work} and the type of the edges
 * connecting them
 * @param vertexType forwarded to the MapWork and MergeJoinWork overloads
 * @param localResources forwarded to the type-specific overloads
 * @return Vertex
 * @throws HiveException if {@code work} is of none of the handled types, or if the stats
 * publisher fails to initialize while HIVE_STATS_RELIABLE is enabled
 */
@SuppressWarnings("deprecation")
public Vertex createVertex(JobConf conf, BaseWork work, Path scratchDir, FileSystem fileSystem, Context ctx, boolean hasChildren, TezWork tezWork, VertexType vertexType, Map<String, LocalResource> localResources) throws Exception {
  final Vertex vertex;

  // Simply dispatch the call to the overload for the actual (sub-)type of BaseWork.
  if (work instanceof MapWork) {
    vertex = createVertex(conf, (MapWork) work, fileSystem, scratchDir, ctx, vertexType, localResources);
  } else if (work instanceof ReduceWork) {
    vertex = createVertex(conf, (ReduceWork) work, fileSystem, scratchDir, ctx, localResources);
  } else if (work instanceof MergeJoinWork) {
    vertex = createVertex(conf, (MergeJoinWork) work, fileSystem, scratchDir, ctx, vertexType, localResources);

    // Collect the parents that feed this vertex through a cross product edge.
    List<String> xprodParentNames = new ArrayList<>();
    for (BaseWork parent : tezWork.getParents(work)) {
      if (EdgeType.XPROD_EDGE == tezWork.getEdgeType(parent, work)) {
        xprodParentNames.add(parent.getName());
      }
    }

    // A cross product destination vertex needs the cartesian product vertex manager;
    // parallelism shouldn't be set for such a vertex.
    if (xprodParentNames.size() > 0) {
      CartesianProductConfig cartesianConfig = new CartesianProductConfig(xprodParentNames);
      VertexManagerPluginDescriptor managerDescriptor =
          VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName());
      managerDescriptor.setUserPayload(cartesianConfig.toUserPayload(new TezConfiguration(conf)));
      vertex.setVertexManagerPlugin(managerDescriptor);
    }
  } else {
    // something is seriously wrong if this is happening
    throw new HiveException(ErrorMsg.GENERIC_ERROR.getErrorCodedMsg());
  }

  // initialize stats publisher if necessary
  if (work.isGatheringStats()) {
    StatsFactory statsFactory = StatsFactory.newFactory(conf);
    if (statsFactory != null) {
      StatsCollectionContext statsContext = new StatsCollectionContext(conf);
      statsContext.setStatsTmpDirs(Utilities.getStatsTmpDirs(work, conf));
      StatsPublisher publisher = statsFactory.getStatsPublisher();
      boolean initialized = publisher.init(statsContext);
      // A failed initialization is only fatal when reliable stats are requested.
      if (!initialized && HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
        throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
      }
    }
  }

  // final vertices need to have at least one output
  if (!hasChildren) {
    String sinkName = "out_" + work.getName();
    OutputDescriptor sinkOutput = OutputDescriptor.create(MROutput.class.getName());
    sinkOutput.setUserPayload(TezUtils.createUserPayloadFromConf(conf));
    vertex.addDataSink(sinkName, new DataSinkDescriptor(sinkOutput, null, null));
  }
  return vertex;
}
use of org.apache.tez.runtime.library.cartesianproduct.CartesianProductConfig in project tez by apache.
the class TestFaultTolerance method testCartesianProduct.
/**
 * In unpartitioned cartesian product, failure fraction should be #unique failure/#consumer that
 * depends on the src task. This test builds a 2x2 cartesian product (v1 x v2 feeding v3) and
 * makes the destination task with index 3 (the 4th one) fail on its input. The failure fraction
 * limit is meant to be 0.25, so the failure fraction should be 1/2, not 1/4, and the DAG is
 * expected to succeed.
 * @throws Exception
 */
@Test
public void testCartesianProduct() throws Exception {
  final String consumer = "v3";

  // NOTE(review): dagConf is populated here but is not handed to the DAG or to any vertex
  // within this method -- confirm whether the 0.25 limit is actually in effect.
  Configuration dagConf = new Configuration();
  dagConf.setDouble(TezConfiguration.TEZ_TASK_MAX_ALLOWED_OUTPUT_FAILURES_FRACTION, 0.25);
  DAG dag = DAG.create("dag");

  // Processor settings shared by all three vertices; the keys are scoped to v3.
  Configuration processorConf = new Configuration();
  processorConf.setInt(
      TestProcessor.getVertexConfName(TestProcessor.TEZ_FAILING_PROCESSOR_VERIFY_TASK_INDEX, consumer), 3);
  processorConf.setInt(
      TestProcessor.getVertexConfName(TestProcessor.TEZ_FAILING_PROCESSOR_VERIFY_VALUE, consumer), 5);
  UserPayload processorPayload = TezUtils.createUserPayloadFromConf(processorConf);
  ProcessorDescriptor processor =
      ProcessorDescriptor.create(TestProcessor.class.getName()).setUserPayload(processorPayload);

  Vertex v1 = Vertex.create("v1", processor, 2);
  Vertex v2 = Vertex.create("v2", processor, 2);
  // No parallelism is given for the cartesian product destination vertex.
  Vertex v3 = Vertex.create(consumer, processor);

  // Unpartitioned (single partition) cartesian product of v1 and v2 with grouping disabled.
  CartesianProductConfig cpConfig = new CartesianProductConfig(Arrays.asList("v1", "v2"));
  TezConfiguration cpTezConf = new TezConfiguration();
  cpTezConf.setInt(CartesianProductVertexManager.TEZ_CARTESIAN_PRODUCT_NUM_PARTITIONS, 1);
  cpTezConf.setBoolean(CartesianProductVertexManager.TEZ_CARTESIAN_PRODUCT_ENABLE_GROUPING, false);
  UserPayload cpPayload = cpConfig.toUserPayload(cpTezConf);

  VertexManagerPluginDescriptor vertexManager =
      VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName());
  vertexManager.setUserPayload(cpPayload);
  v3.setVertexManagerPlugin(vertexManager);

  EdgeManagerPluginDescriptor edgeManager =
      EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName());
  edgeManager.setUserPayload(cpPayload);

  // Input failure settings for v3: task index 3, task attempt 0, input index 0,
  // failing up to input attempt 0.
  Configuration failingInputConf = new Configuration();
  failingInputConf.setBoolean(
      TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_DO_FAIL, consumer), true);
  failingInputConf.setInt(
      TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_TASK_INDEX, consumer), 3);
  failingInputConf.setInt(
      TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_TASK_ATTEMPT, consumer), 0);
  failingInputConf.setInt(
      TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_INPUT_INDEX, consumer), 0);
  failingInputConf.setInt(
      TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_UPTO_INPUT_ATTEMPT, consumer), 0);
  UserPayload failingInputPayload = TezUtils.createUserPayloadFromConf(failingInputConf);

  // Both source vertices reach v3 through the same custom edge definition.
  EdgeProperty cpEdge = EdgeProperty.create(
      edgeManager,
      DataMovementType.CUSTOM,
      DataSourceType.PERSISTED,
      SchedulingType.SEQUENTIAL,
      TestOutput.getOutputDesc(null),
      TestInput.getInputDesc(failingInputPayload));
  Edge fromV1 = Edge.create(v1, v3, cpEdge);
  Edge fromV2 = Edge.create(v2, v3, cpEdge);

  dag.addVertex(v1);
  dag.addVertex(v2);
  dag.addVertex(v3);
  dag.addEdge(fromV1);
  dag.addEdge(fromV2);

  // run dag
  runDAGAndVerify(dag, DAGStatus.State.SUCCEEDED);
}
use of org.apache.tez.runtime.library.cartesianproduct.CartesianProductConfig in project tez by apache.
the class CartesianProduct method createDAG.
/**
 * Builds the "CrossProduct" DAG: two TokenProcessor vertices, each reading from the fake
 * data source, feed a JoinProcessor vertex through cartesian product edges. The join vertex
 * writes to the fake data sink and is managed by CartesianProductVertexManager.
 *
 * @param tezConf configuration serialized into the cartesian product user payload
 * @return the assembled DAG
 * @throws IOException declared for the payload and edge configuration calls made here
 */
private DAG createDAG(TezConfiguration tezConf) throws IOException {
  // Both source vertices share one data source definition.
  DataSourceDescriptor sharedSource = DataSourceDescriptor.create(
      InputDescriptor.create(FakeInput.class.getName()),
      InputInitializerDescriptor.create(FakeInputInitializer.class.getName()),
      null);

  Vertex firstSource = Vertex.create(VERTEX1, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  firstSource.addDataSource(INPUT, sharedSource);
  Vertex secondSource = Vertex.create(VERTEX2, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  secondSource.addDataSource(INPUT, sharedSource);

  DataSinkDescriptor joinSink = DataSinkDescriptor.create(
      OutputDescriptor.create(FakeOutput.class.getName()),
      OutputCommitterDescriptor.create(FakeOutputCommitter.class.getName()),
      null);

  // The same payload configures both the vertex manager and the edge manager.
  UserPayload cpPayload = new CartesianProductConfig(Arrays.asList(sourceVertices)).toUserPayload(tezConf);

  Vertex joinVertex = Vertex.create(VERTEX3, ProcessorDescriptor.create(JoinProcessor.class.getName()));
  joinVertex.addDataSink(OUTPUT, joinSink);
  VertexManagerPluginDescriptor vertexManager =
      VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName());
  vertexManager.setUserPayload(cpPayload);
  joinVertex.setVertexManagerPlugin(vertexManager);

  EdgeManagerPluginDescriptor edgeManager =
      EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName());
  edgeManager.setUserPayload(cpPayload);

  EdgeProperty cpEdge = UnorderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(), RoundRobinPartitioner.class.getName())
      .build()
      .createDefaultCustomEdgeProperty(edgeManager);

  DAG dag = DAG.create("CrossProduct");
  dag.addVertex(firstSource);
  dag.addVertex(secondSource);
  dag.addVertex(joinVertex);
  dag.addEdge(Edge.create(firstSource, joinVertex, cpEdge));
  dag.addEdge(Edge.create(secondSource, joinVertex, cpEdge));
  return dag;
}
use of org.apache.tez.runtime.library.cartesianproduct.CartesianProductConfig in project hive by apache.
the class DagUtils method createEdgeProperty.
/**
 * Helper function to create an edge property from an edge type.
 *
 * <p>The key class, value class and partitioner class are read from {@code conf} after it has
 * been passed through {@code MRHelpers.translateMRConfToTez}. The edge type of
 * {@code edgeProp} then selects the kind of edge configuration that is built:
 * <ul>
 * <li>BROADCAST_EDGE: unordered KV edge, default broadcast property</li>
 * <li>CUSTOM_EDGE: unordered partitioned KV edge managed by CustomPartitionEdge, whose
 * payload is the serialized CustomEdgeConfiguration carrying the bucket count</li>
 * <li>CUSTOM_SIMPLE_EDGE: unordered partitioned KV edge, optionally with an explicit
 * unordered output buffer size</li>
 * <li>ONE_TO_ONE_EDGE: unordered KV edge, default one-to-one property</li>
 * <li>XPROD_EDGE: unordered partitioned KV edge managed by CartesianProductEdgeManager</li>
 * <li>SIMPLE_EDGE and anything else: ordered partitioned KV edge</li>
 * </ul>
 *
 * @param w not used by this method
 * @param edgeProp supplies the edge type and, depending on it, the number of buckets and
 * the buffer size
 * @param conf configuration the class names and edge settings are taken from
 * @param work destination work; only consulted for XPROD_EDGE to find its parents
 * @param tezWork graph used for XPROD_EDGE to find the cross product source vertices
 * @return the edge property for the requested edge type
 * @throws IOException declared for the serialization and edge configuration calls made here
 */
private EdgeProperty createEdgeProperty(Vertex w, TezEdgeProperty edgeProp, Configuration conf, BaseWork work, TezWork tezWork) throws IOException {
  MRHelpers.translateMRConfToTez(conf);
  String keyClass = conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS);
  String valClass = conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS);
  String partitionerClassName = conf.get("mapred.partitioner.class");
  Map<String, String> partitionerConf;
  EdgeType edgeType = edgeProp.getEdgeType();
  switch (edgeType) {
    case BROADCAST_EDGE:
      UnorderedKVEdgeConfig et1Conf = UnorderedKVEdgeConfig.newBuilder(keyClass, valClass)
          .setFromConfiguration(conf)
          .setKeySerializationClass(TezBytesWritableSerialization.class.getName(), null)
          .setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null)
          .build();
      return et1Conf.createDefaultBroadcastEdgeProperty();
    case CUSTOM_EDGE:
      assert partitionerClassName != null;
      partitionerConf = createPartitionerConf(partitionerClassName, conf);
      UnorderedPartitionedKVEdgeConfig et2Conf = UnorderedPartitionedKVEdgeConfig
          .newBuilder(keyClass, valClass, MRPartitioner.class.getName(), partitionerConf)
          .setFromConfiguration(conf)
          .setKeySerializationClass(TezBytesWritableSerialization.class.getName(), null)
          .setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null)
          .build();
      EdgeManagerPluginDescriptor edgeDesc = EdgeManagerPluginDescriptor.create(CustomPartitionEdge.class.getName());
      CustomEdgeConfiguration edgeConf = new CustomEdgeConfiguration(edgeProp.getNumBuckets(), null);
      DataOutputBuffer dob = new DataOutputBuffer();
      edgeConf.write(dob);
      // getData() returns the buffer's whole backing array, of which only the first
      // getLength() bytes were written; wrap just that range so the payload does not carry
      // the unused tail of the array.
      edgeDesc.setUserPayload(UserPayload.create(ByteBuffer.wrap(dob.getData(), 0, dob.getLength())));
      return et2Conf.createDefaultCustomEdgeProperty(edgeDesc);
    case CUSTOM_SIMPLE_EDGE:
      assert partitionerClassName != null;
      partitionerConf = createPartitionerConf(partitionerClassName, conf);
      UnorderedPartitionedKVEdgeConfig.Builder et3Conf = UnorderedPartitionedKVEdgeConfig
          .newBuilder(keyClass, valClass, MRPartitioner.class.getName(), partitionerConf)
          .setFromConfiguration(conf)
          .setKeySerializationClass(TezBytesWritableSerialization.class.getName(), null)
          .setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null);
      // Only override the unordered output buffer size when the edge asks for one.
      if (edgeProp.getBufferSize() != null) {
        et3Conf.setAdditionalConfiguration(TezRuntimeConfiguration.TEZ_RUNTIME_UNORDERED_OUTPUT_BUFFER_SIZE_MB, edgeProp.getBufferSize().toString());
      }
      return et3Conf.build().createDefaultEdgeProperty();
    case ONE_TO_ONE_EDGE:
      UnorderedKVEdgeConfig et4Conf = UnorderedKVEdgeConfig.newBuilder(keyClass, valClass)
          .setFromConfiguration(conf)
          .setKeySerializationClass(TezBytesWritableSerialization.class.getName(), null)
          .setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null)
          .build();
      return et4Conf.createDefaultOneToOneEdgeProperty();
    case XPROD_EDGE:
      EdgeManagerPluginDescriptor edgeManagerDescriptor = EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName());
      // The cartesian product is configured with the names of every parent of the
      // destination work that is connected through a cross product edge.
      List<String> crossProductSources = new ArrayList<>();
      for (BaseWork parentWork : tezWork.getParents(work)) {
        if (EdgeType.XPROD_EDGE == tezWork.getEdgeType(parentWork, work)) {
          crossProductSources.add(parentWork.getName());
        }
      }
      CartesianProductConfig cpConfig = new CartesianProductConfig(crossProductSources);
      edgeManagerDescriptor.setUserPayload(cpConfig.toUserPayload(new TezConfiguration(conf)));
      UnorderedPartitionedKVEdgeConfig cpEdgeConf = UnorderedPartitionedKVEdgeConfig
          .newBuilder(keyClass, valClass, ValueHashPartitioner.class.getName())
          .setFromConfiguration(conf)
          .setKeySerializationClass(TezBytesWritableSerialization.class.getName(), null)
          .setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null)
          .build();
      return cpEdgeConf.createDefaultCustomEdgeProperty(edgeManagerDescriptor);
    case SIMPLE_EDGE:
      // fallthrough
    default:
      assert partitionerClassName != null;
      partitionerConf = createPartitionerConf(partitionerClassName, conf);
      OrderedPartitionedKVEdgeConfig et5Conf = OrderedPartitionedKVEdgeConfig
          .newBuilder(keyClass, valClass, MRPartitioner.class.getName(), partitionerConf)
          .setFromConfiguration(conf)
          .setKeySerializationClass(TezBytesWritableSerialization.class.getName(), TezBytesComparator.class.getName(), null)
          .setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null)
          .build();
      return et5Conf.createDefaultEdgeProperty();
  }
}
use of org.apache.tez.runtime.library.cartesianproduct.CartesianProductConfig in project hive by apache.
the class DagUtils method createVertex.
/**
 * Create a vertex from a given work object.
 *
 * <p>The call is dispatched to the factory method matching the concrete type of
 * {@code workUnit} (MapWork, ReduceWork or MergeJoinWork). A MergeJoinWork vertex that has at
 * least one parent connected through an XPROD_EDGE additionally gets a
 * CartesianProductVertexManager. The vertex then receives the task local files, the task
 * launch options and its execution context; the stats publisher is initialized when the work
 * gathers stats; and a data sink named {@code "out_" + workUnit.getName()} is attached when
 * the work is a leaf of {@code tezWork} or contains a FileSinkOperator.
 *
 * @param conf JobConf to be used to this execution unit
 * @param workUnit The instance of BaseWork representing the actual work to be performed
 * by this vertex.
 * @param scratchDir HDFS scratch dir for this execution unit.
 * @param tezWork graph used to look up the vertex type, the parents and edge types of
 * {@code workUnit}, and whether it is a leaf
 * @param localResources task local files added to the vertex
 * @return Vertex
 * @throws HiveException if {@code workUnit} is of none of the handled types, or if the stats
 * publisher fails to initialize while HIVE_STATS_RELIABLE is enabled
 */
@SuppressWarnings("deprecation")
public Vertex createVertex(JobConf conf, BaseWork workUnit, Path scratchDir, TezWork tezWork, Map<String, LocalResource> localResources) throws Exception {
  final VertexType vertexType = tezWork.getVertexType(workUnit);
  final Vertex result;

  // simply dispatch the call to the right method for the actual (sub-) type of BaseWork.
  if (workUnit instanceof MapWork) {
    result = createVertexFromMapWork(conf, (MapWork) workUnit, scratchDir, vertexType);
  } else if (workUnit instanceof ReduceWork) {
    result = createVertexFromReduceWork(conf, (ReduceWork) workUnit, scratchDir);
  } else if (workUnit instanceof MergeJoinWork) {
    result = createVertexFromMergeWork(conf, (MergeJoinWork) workUnit, scratchDir, vertexType);

    // Collect the parents that feed this vertex through a cross product edge.
    List<String> xprodParentNames = new ArrayList<>();
    for (BaseWork parent : tezWork.getParents(workUnit)) {
      if (EdgeType.XPROD_EDGE == tezWork.getEdgeType(parent, workUnit)) {
        xprodParentNames.add(parent.getName());
      }
    }

    // A cross product destination vertex needs the cartesian product vertex manager;
    // parallelism shouldn't be set for such a vertex.
    if (xprodParentNames.size() > 0) {
      CartesianProductConfig cartesianConfig = new CartesianProductConfig(xprodParentNames);
      VertexManagerPluginDescriptor managerDescriptor =
          VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName());
      managerDescriptor.setUserPayload(cartesianConfig.toUserPayload(new TezConfiguration(conf)));
      result.setVertexManagerPlugin(managerDescriptor);
    }
  } else {
    // something is seriously wrong if this is happening
    throw new HiveException(ErrorMsg.GENERIC_ERROR.getErrorCodedMsg());
  }

  VertexExecutionContext executionContext = createVertexExecutionContext(workUnit);
  result.addTaskLocalFiles(localResources);
  result.setTaskLaunchCmdOpts(getContainerJavaOpts(conf));
  result.setExecutionContext(executionContext);

  // initialize stats publisher if necessary
  if (workUnit.isGatheringStats()) {
    StatsFactory statsFactory = StatsFactory.newFactory(conf);
    if (statsFactory != null) {
      StatsCollectionContext statsContext = new StatsCollectionContext(conf);
      statsContext.setStatsTmpDirs(Utilities.getStatsTmpDirs(workUnit, conf));
      StatsPublisher publisher = statsFactory.getStatsPublisher();
      boolean initialized = publisher.init(statsContext);
      // A failed initialization is only fatal when reliable stats are requested.
      if (!initialized && HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
        throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
      }
    }
  }

  // Hive uses HiveOutputFormatImpl when it is going to write all its data through the FS
  // operator; in that case the sink is backed by NullMROutput instead of MROutput.
  final String sinkOutputClassName =
      HiveOutputFormatImpl.class.getName().equals(conf.get("mapred.output.format.class"))
          ? NullMROutput.class.getName()
          : MROutput.class.getName();

  // If there is a fileSink add a DataSink to the vertex
  boolean containsFileSink = workUnit.getAllOperators().stream().anyMatch(FileSinkOperator.class::isInstance);
  // final vertices need to have at least one output
  boolean isLeaf = tezWork.getLeaves().contains(workUnit);

  if (isLeaf || containsFileSink) {
    String committerName = HiveConf.getVar(conf, ConfVars.TEZ_MAPREDUCE_OUTPUT_COMMITTER);
    // A committer is only attached when one is configured.
    OutputCommitterDescriptor committerDescriptor =
        (committerName == null || committerName.isEmpty())
            ? null
            : OutputCommitterDescriptor.create(committerName);
    String sinkName = "out_" + workUnit.getName();
    // The sink reuses the user payload of the vertex's processor descriptor.
    OutputDescriptor sinkOutput = OutputDescriptor.create(sinkOutputClassName);
    sinkOutput.setUserPayload(result.getProcessorDescriptor().getUserPayload());
    result.addDataSink(sinkName, new DataSinkDescriptor(sinkOutput, committerDescriptor, null));
  }
  return result;
}
Aggregations