Search in sources :

Example 1 with HashJoinExample

use of org.apache.tez.examples.HashJoinExample in project tez by apache.

In class TestExtServicesWithLocalMode, method test1.

/**
 * Starts a session-mode {@link TezClient} configured with the test task scheduler,
 * container launcher and task communicator plugins, generates join data, runs
 * {@link HashJoinExample} on it, and then calls {@code runJoinValidate} with three
 * different combinations of execution contexts.
 *
 * @throws Exception if any step of the run fails
 */
@Test(timeout = 30000)
public void test1() throws Exception {
    // One payload, built from the job configuration, is shared by all three plugin descriptors.
    UserPayload pluginPayload = TezUtils.createUserPayloadFromConf(confForJobs);
    TaskSchedulerDescriptor[] schedulers = {
        TaskSchedulerDescriptor.create(EXT_PUSH_ENTITY_NAME, TezTestServiceTaskSchedulerService.class.getName())
            .setUserPayload(pluginPayload)
    };
    ContainerLauncherDescriptor[] launchers = {
        ContainerLauncherDescriptor.create(EXT_PUSH_ENTITY_NAME, TezTestServiceNoOpContainerLauncher.class.getName())
            .setUserPayload(pluginPayload)
    };
    TaskCommunicatorDescriptor[] communicators = {
        TaskCommunicatorDescriptor.create(EXT_PUSH_ENTITY_NAME, TezTestServiceTaskCommunicatorImpl.class.getName())
            .setUserPayload(pluginPayload)
    };
    ServicePluginsDescriptor plugins =
        ServicePluginsDescriptor.create(true, false, schedulers, launchers, communicators);

    TezConfiguration tezConf = new TezConfiguration(confForJobs);
    TezClient tezClient = TezClient.newBuilder("test1", tezConf)
        .setIsSession(true)
        .setServicePluginDescriptor(plugins)
        .build();
    try {
        tezClient.start();

        Path lhsInput = new Path(SRC_DATA_DIR, "inPath1");
        Path rhsInput = new Path(SRC_DATA_DIR, "inPath2");
        Path expectedOutput = new Path(SRC_DATA_DIR, "expectedOutputPath");

        // Step 1: generate both join inputs plus the expected join result.
        String[] dataGenArgs = {
            lhsInput.toString(), "1048576", rhsInput.toString(), "524288", expectedOutput.toString(), "2"
        };
        int dataGenExit = new JoinDataGen().run(tezConf, dataGenArgs, tezClient);
        assertEquals(0, dataGenExit);

        // Step 2: run the actual hash join over the generated inputs.
        Path joinOutput = new Path(SRC_DATA_DIR, "outPath");
        String[] joinArgs = { lhsInput.toString(), rhsInput.toString(), "2", joinOutput.toString() };
        int joinExit = new HashJoinExample().run(tezConf, joinArgs, tezClient);
        assertEquals(0, joinExit);
        LOG.info("Completed generating Data - Expected Hash Result and Actual Join Result");

        // Nothing has been submitted to the external test service so far.
        assertEquals(0, tezTestServiceCluster.getNumSubmissions());

        // Ext consuming from ext.
        runJoinValidate(tezClient, "allInExt", 7, EXECUTION_CONTEXT_EXT_SERVICE_PUSH,
            EXECUTION_CONTEXT_EXT_SERVICE_PUSH, EXECUTION_CONTEXT_EXT_SERVICE_PUSH);
        LOG.info("Completed allInExt");

        // Uber consuming from uber.
        runJoinValidate(tezClient, "noneInExt", 0, null, null, null);
        LOG.info("Completed noneInExt");

        // Uber consuming from ext.
        runJoinValidate(tezClient, "lhsInExt", 2, EXECUTION_CONTEXT_EXT_SERVICE_PUSH, null, null);
        LOG.info("Completed lhsInExt");

        // Ext consuming from uber is deliberately not exercised: in this mode there is no
        // working shuffle handler, and the local data transfer semantics may not match.
    } finally {
        tezClient.stop();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) UserPayload(org.apache.tez.dag.api.UserPayload) TaskSchedulerDescriptor(org.apache.tez.serviceplugins.api.TaskSchedulerDescriptor) ServicePluginsDescriptor(org.apache.tez.serviceplugins.api.ServicePluginsDescriptor) TaskCommunicatorDescriptor(org.apache.tez.serviceplugins.api.TaskCommunicatorDescriptor) TezClient(org.apache.tez.client.TezClient) ContainerLauncherDescriptor(org.apache.tez.serviceplugins.api.ContainerLauncherDescriptor) JoinDataGen(org.apache.tez.examples.JoinDataGen) HashJoinExample(org.apache.tez.examples.HashJoinExample) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) Test(org.junit.Test)

Example 2 with HashJoinExample

use of org.apache.tez.examples.HashJoinExample in project tez by apache.

In class ExternalTezServiceTestHelper, method setupHashJoinData.

/**
 * Creates {@code srcDataDir}, generates the two join inputs and the expected join
 * result with {@link JoinDataGen}, then runs {@link HashJoinExample} over the inputs.
 * Both jobs run through {@code sharedTezClient} and must exit with status 0.
 *
 * @param srcDataDir directory created on {@code remoteFs} before any job runs
 * @param dataPath1 first generated input, also the first join input
 * @param dataPath2 second generated input, also the second join input
 * @param expectedResultPath where the data generator writes the expected join result
 * @param outputPath where the hash join writes its actual result
 * @throws Exception if directory creation or either job fails
 */
public void setupHashJoinData(Path srcDataDir, Path dataPath1, Path dataPath2, Path expectedResultPath, Path outputPath) throws Exception {
    remoteFs.mkdirs(srcDataDir);
    TezConfiguration tezConf = new TezConfiguration(confForJobs);

    // Data generation, using 2 tasks.
    String[] generatorArgs = {
        dataPath1.toString(), "1048576", dataPath2.toString(), "524288", expectedResultPath.toString(), "2"
    };
    int generatorExit = new JoinDataGen().run(tezConf, generatorArgs, sharedTezClient);
    assertEquals(0, generatorExit);

    // The join itself, using 2 reducers.
    String[] joinArgs = { dataPath1.toString(), dataPath2.toString(), "2", outputPath.toString() };
    int joinExit = new HashJoinExample().run(tezConf, joinArgs, sharedTezClient);
    assertEquals(0, joinExit);

    LOG.info("Completed generating Data - Expected Hash Result and Actual Join Result");
}
Also used : JoinDataGen(org.apache.tez.examples.JoinDataGen) HashJoinExample(org.apache.tez.examples.HashJoinExample) TezConfiguration(org.apache.tez.dag.api.TezConfiguration)

Example 3 with HashJoinExample

use of org.apache.tez.examples.HashJoinExample in project tez by apache.

In class TestRecovery, method testHashJoinExample.

/**
 * Runs {@link HashJoinExample} with the recovery-service hook configured, verifies the
 * join output, and then checks that the last recovery event recorded for AM attempt 1
 * is the one described by {@code shutdownCondition}.
 *
 * <p>Input 1 holds {@code term0..term19}; input 2 holds only the even-numbered terms,
 * so the expected join result is exactly the even-numbered terms.
 *
 * @param shutdownCondition serialized into the configuration under
 *        {@code SIMPLE_SHUTDOWN_CONDITION} and matched against the last recovery event
 * @param enableAutoParallelism value set for the shuffle vertex manager's auto-parallel flag
 * @param generateSplitInClient whether to pass {@code -generateSplitInClient} to the example
 * @throws Exception if file system access or the job itself fails
 */
private void testHashJoinExample(SimpleShutdownCondition shutdownCondition, boolean enableAutoParallelism, boolean generateSplitInClient) throws Exception {
    HashJoinExample hashJoinExample = new HashJoinExample();
    TezConfiguration tezConf = new TezConfiguration(miniTezCluster.getConfig());
    tezConf.setInt(TezConfiguration.TEZ_AM_MAX_APP_ATTEMPTS, 4);
    tezConf.set(TezConfiguration.TEZ_AM_RECOVERY_SERVICE_CLASS, RecoveryServiceWithEventHandlingHook.class.getName());
    tezConf.set(RecoveryServiceWithEventHandlingHook.AM_RECOVERY_SERVICE_HOOK_CLASS, SimpleRecoveryEventHook.class.getName());
    tezConf.set(SimpleRecoveryEventHook.SIMPLE_SHUTDOWN_CONDITION, shutdownCondition.serialize());
    tezConf.setBoolean(ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_ENABLE_AUTO_PARALLEL, enableAutoParallelism);
    tezConf.setBoolean(RecoveryService.TEZ_TEST_RECOVERY_DRAIN_EVENTS_WHEN_STOPPED, false);
    tezConf.setBoolean(TezConfiguration.TEZ_AM_STAGING_SCRATCH_DATA_AUTO_DELETE, false);
    tezConf.set(TezConfiguration.TEZ_AM_LOG_LEVEL, "INFO;org.apache.tez=DEBUG");
    hashJoinExample.setConf(tezConf);
    Path stagingDirPath = new Path("/tmp/tez-staging-dir");
    Path inPath1 = new Path("/tmp/hashJoin/inPath1");
    Path inPath2 = new Path("/tmp/hashJoin/inPath2");
    Path outPath = new Path("/tmp/hashJoin/outPath");
    // Remove output left over from a previous invocation before the job runs.
    remoteFs.delete(outPath, true);
    remoteFs.mkdirs(inPath1);
    remoteFs.mkdirs(inPath2);
    remoteFs.mkdirs(stagingDirPath);
    Set<String> expectedResult = new HashSet<String>();
    // try-with-resources closes each writer (and the FS stream it wraps) even when a
    // write fails part-way. The charset is explicit so the bytes written do not depend
    // on the platform default.
    try (BufferedWriter writer1 = new BufferedWriter(new OutputStreamWriter(
             remoteFs.create(new Path(inPath1, "file")), java.nio.charset.StandardCharsets.UTF_8));
         BufferedWriter writer2 = new BufferedWriter(new OutputStreamWriter(
             remoteFs.create(new Path(inPath2, "file")), java.nio.charset.StandardCharsets.UTF_8))) {
        for (int i = 0; i < 20; i++) {
            String term = "term" + i;
            writer1.write(term);
            writer1.newLine();
            // Only even-numbered terms appear in both inputs, so only they are expected to join.
            if (i % 2 == 0) {
                writer2.write(term);
                writer2.newLine();
                expectedResult.add(term);
            }
        }
    }
    String[] args;
    if (generateSplitInClient) {
        args = new String[] { "-D" + TezConfiguration.TEZ_AM_STAGING_DIR + "=" + stagingDirPath.toString(), "-generateSplitInClient", inPath1.toString(), inPath2.toString(), "1", outPath.toString() };
    } else {
        args = new String[] { "-D" + TezConfiguration.TEZ_AM_STAGING_DIR + "=" + stagingDirPath.toString(), inPath1.toString(), inPath2.toString(), "1", outPath.toString() };
    }
    assertEquals(0, hashJoinExample.run(args));
    // Skip files whose names start with '_' or '.'; only data files should remain.
    FileStatus[] statuses = remoteFs.listStatus(outPath, new PathFilter() {

        @Override
        public boolean accept(Path p) {
            String name = p.getName();
            return !name.startsWith("_") && !name.startsWith(".");
        }
    });
    assertEquals(1, statuses.length);
    // The reader is closed even if an assertion inside the loop fails.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(
             remoteFs.open(statuses[0].getPath()), java.nio.charset.StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            // Every output line must be an expected term that has not been seen yet.
            assertTrue(expectedResult.remove(line));
        }
    }
    // Nothing expected may be missing from the output.
    assertEquals(0, expectedResult.size());
    List<HistoryEvent> historyEventsOfAttempt1 = RecoveryParser.readRecoveryEvents(tezConf, hashJoinExample.getAppId(), 1);
    // Fail with a readable message instead of an IndexOutOfBoundsException on an empty list.
    assertTrue("No recovery events were read for AM attempt 1", !historyEventsOfAttempt1.isEmpty());
    HistoryEvent lastEvent = historyEventsOfAttempt1.get(historyEventsOfAttempt1.size() - 1);
    assertEquals(shutdownCondition.getEvent().getEventType(), lastEvent.getEventType());
    assertTrue(shutdownCondition.match(lastEvent));
}
Also used : Path(org.apache.hadoop.fs.Path) PathFilter(org.apache.hadoop.fs.PathFilter) FileStatus(org.apache.hadoop.fs.FileStatus) InputStreamReader(java.io.InputStreamReader) HistoryEvent(org.apache.tez.dag.history.HistoryEvent) SimpleRecoveryEventHook(org.apache.tez.test.RecoveryServiceWithEventHandlingHook.SimpleRecoveryEventHook) BufferedWriter(java.io.BufferedWriter) HashJoinExample(org.apache.tez.examples.HashJoinExample) BufferedReader(java.io.BufferedReader) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) OutputStreamWriter(java.io.OutputStreamWriter) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) HashSet(java.util.HashSet)

Example 4 with HashJoinExample

use of org.apache.tez.examples.HashJoinExample in project tez by apache.

In class TestTezJobs, method testHashJoinExample.

/**
 * Runs {@link HashJoinExample} as a standalone tool (with the {@code -counter} flag)
 * over two small generated inputs and verifies the single output file.
 *
 * <p>Input 1 holds {@code term0..term19}; input 2 holds only the even-numbered terms,
 * so the expected join result is exactly the even-numbered terms.
 *
 * @throws Exception if file system access or the job itself fails
 */
@Test(timeout = 60000)
public void testHashJoinExample() throws Exception {
    HashJoinExample hashJoinExample = new HashJoinExample();
    hashJoinExample.setConf(mrrTezCluster.getConfig());
    Path stagingDirPath = new Path("/tmp/tez-staging-dir");
    Path inPath1 = new Path("/tmp/hashJoin/inPath1");
    Path inPath2 = new Path("/tmp/hashJoin/inPath2");
    Path outPath = new Path("/tmp/hashJoin/outPath");
    remoteFs.mkdirs(inPath1);
    remoteFs.mkdirs(inPath2);
    remoteFs.mkdirs(stagingDirPath);
    Set<String> expectedResult = new HashSet<String>();
    // try-with-resources closes each writer (and the FS stream it wraps) even when a
    // write fails part-way. The charset is explicit so the bytes written do not depend
    // on the platform default.
    try (BufferedWriter writer1 = new BufferedWriter(new OutputStreamWriter(
             remoteFs.create(new Path(inPath1, "file")), java.nio.charset.StandardCharsets.UTF_8));
         BufferedWriter writer2 = new BufferedWriter(new OutputStreamWriter(
             remoteFs.create(new Path(inPath2, "file")), java.nio.charset.StandardCharsets.UTF_8))) {
        for (int i = 0; i < 20; i++) {
            String term = "term" + i;
            writer1.write(term);
            writer1.newLine();
            // Only even-numbered terms appear in both inputs, so only they are expected to join.
            if (i % 2 == 0) {
                writer2.write(term);
                writer2.newLine();
                expectedResult.add(term);
            }
        }
    }
    String[] args = new String[] { "-D" + TezConfiguration.TEZ_AM_STAGING_DIR + "=" + stagingDirPath.toString(), "-counter", inPath1.toString(), inPath2.toString(), "1", outPath.toString() };
    assertEquals(0, hashJoinExample.run(args));
    // Skip files whose names start with '_' or '.'; only data files should remain.
    FileStatus[] statuses = remoteFs.listStatus(outPath, new PathFilter() {

        @Override
        public boolean accept(Path p) {
            String name = p.getName();
            return !name.startsWith("_") && !name.startsWith(".");
        }
    });
    assertEquals(1, statuses.length);
    // The reader is closed even if an assertion inside the loop fails.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(
             remoteFs.open(statuses[0].getPath()), java.nio.charset.StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            // Every output line must be an expected term that has not been seen yet.
            assertTrue(expectedResult.remove(line));
        }
    }
    // Nothing expected may be missing from the output.
    assertEquals(0, expectedResult.size());
}
Also used : Path(org.apache.hadoop.fs.Path) PathFilter(org.apache.hadoop.fs.PathFilter) FileStatus(org.apache.hadoop.fs.FileStatus) InputStreamReader(java.io.InputStreamReader) BufferedWriter(java.io.BufferedWriter) HashJoinExample(org.apache.tez.examples.HashJoinExample) BufferedReader(java.io.BufferedReader) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) OutputStreamWriter(java.io.OutputStreamWriter) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 5 with HashJoinExample

use of org.apache.tez.examples.HashJoinExample in project tez by apache.

In class TestTezJobs, method testHashJoinExamplePipeline.

/**
 * Exercises the complete {@link HashJoinExample} pipeline inside one Tez session:
 * {@link JoinDataGen}, then {@link HashJoinExample}, then {@link JoinValidate}.
 * Each stage must exit with status 0.
 *
 * @throws Exception if directory setup or any stage of the pipeline fails
 */
@Test(timeout = 120000)
public void testHashJoinExamplePipeline() throws Exception {
    Path pipelineDir = new Path("/tmp/testHashJoinExample");
    Path stagingDir = new Path("/tmp/tez-staging-dir");
    remoteFs.mkdirs(stagingDir);
    remoteFs.mkdirs(pipelineDir);

    Path lhsInput = new Path(pipelineDir, "inPath1");
    Path rhsInput = new Path(pipelineDir, "inPath2");
    Path expectedOutput = new Path(pipelineDir, "expectedOutputPath");
    Path actualOutput = new Path(pipelineDir, "outPath");

    TezConfiguration tezConf = new TezConfiguration(mrrTezCluster.getConfig());
    tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, stagingDir.toString());

    // If creation throws there is no session to stop, so only the started part is guarded.
    TezClient session = TezClient.create("HashJoinExampleSession", tezConf, true);
    try {
        session.start();

        // Stage 1: generate both inputs and the expected join result.
        String[] generateArgs = {
            "-counter", lhsInput.toString(), "1048576", rhsInput.toString(), "524288", expectedOutput.toString(), "2"
        };
        int generateExit = new JoinDataGen().run(tezConf, generateArgs, session);
        assertEquals(0, generateExit);

        // Stage 2: perform the hash join.
        String[] joinArgs = { lhsInput.toString(), rhsInput.toString(), "2", actualOutput.toString() };
        int joinExit = new HashJoinExample().run(tezConf, joinArgs, session);
        assertEquals(0, joinExit);

        // Stage 3: compare the actual join output against the expected one.
        String[] validateArgs = { "-counter", expectedOutput.toString(), actualOutput.toString(), "3" };
        int validateExit = new JoinValidate().run(tezConf, validateArgs, session);
        assertEquals(0, validateExit);
    } finally {
        session.stop();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) JoinDataGen(org.apache.tez.examples.JoinDataGen) HashJoinExample(org.apache.tez.examples.HashJoinExample) JoinValidate(org.apache.tez.examples.JoinValidate) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) TezClient(org.apache.tez.client.TezClient) Test(org.junit.Test)

Aggregations

HashJoinExample (org.apache.tez.examples.HashJoinExample): 6, Path (org.apache.hadoop.fs.Path): 5, TezConfiguration (org.apache.tez.dag.api.TezConfiguration): 4, Test (org.junit.Test): 4, BufferedReader (java.io.BufferedReader): 3, BufferedWriter (java.io.BufferedWriter): 3, InputStreamReader (java.io.InputStreamReader): 3, OutputStreamWriter (java.io.OutputStreamWriter): 3, HashSet (java.util.HashSet): 3, FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 3, FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 3, FileStatus (org.apache.hadoop.fs.FileStatus): 3, PathFilter (org.apache.hadoop.fs.PathFilter): 3, JoinDataGen (org.apache.tez.examples.JoinDataGen): 3, TezClient (org.apache.tez.client.TezClient): 2, UserPayload (org.apache.tez.dag.api.UserPayload): 1, HistoryEvent (org.apache.tez.dag.history.HistoryEvent): 1, JoinValidate (org.apache.tez.examples.JoinValidate): 1, ContainerLauncherDescriptor (org.apache.tez.serviceplugins.api.ContainerLauncherDescriptor): 1, ServicePluginsDescriptor (org.apache.tez.serviceplugins.api.ServicePluginsDescriptor): 1