Search in sources :

Example 1 with SortMergeJoinExample

use of org.apache.tez.examples.SortMergeJoinExample in project tez by apache.

the class TestTezJobs method testSortMergeJoinExamplePipeline.

/**
 * test whole {@link SortMergeJoinExample} pipeline as following: <br>
 * {@link JoinDataGen} -> {@link SortMergeJoinExample} -> {@link JoinValidate}
 * @throws Exception
 */
@Test(timeout = 120000)
public void testSortMergeJoinExamplePipeline() throws Exception {
    Path testDir = new Path("/tmp/testSortMergeExample");
    Path stagingDirPath = new Path("/tmp/tez-staging-dir");
    remoteFs.mkdirs(stagingDirPath);
    remoteFs.mkdirs(testDir);
    Path dataPath1 = new Path(testDir, "inPath1");
    Path dataPath2 = new Path(testDir, "inPath2");
    Path expectedOutputPath = new Path(testDir, "expectedOutputPath");
    Path outPath = new Path(testDir, "outPath");
    TezConfiguration tezConf = new TezConfiguration(mrrTezCluster.getConfig());
    tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, stagingDirPath.toString());
    TezClient tezSession = null;
    try {
        tezSession = TezClient.create("SortMergeExampleSession", tezConf, true);
        tezSession.start();
        JoinDataGen dataGen = new JoinDataGen();
        String[] dataGenArgs = new String[] { dataPath1.toString(), "1048576", dataPath2.toString(), "524288", expectedOutputPath.toString(), "2" };
        assertEquals(0, dataGen.run(tezConf, dataGenArgs, tezSession));
        SortMergeJoinExample joinExample = new SortMergeJoinExample();
        String[] args = new String[] { dataPath1.toString(), dataPath2.toString(), "2", outPath.toString() };
        assertEquals(0, joinExample.run(tezConf, args, tezSession));
        JoinValidate joinValidate = new JoinValidate();
        String[] validateArgs = new String[] { expectedOutputPath.toString(), outPath.toString(), "3" };
        assertEquals(0, joinValidate.run(tezConf, validateArgs, tezSession));
    } finally {
        if (tezSession != null) {
            tezSession.stop();
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) JoinDataGen(org.apache.tez.examples.JoinDataGen) SortMergeJoinExample(org.apache.tez.examples.SortMergeJoinExample) JoinValidate(org.apache.tez.examples.JoinValidate) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) TezClient(org.apache.tez.client.TezClient) Test(org.junit.Test)

Example 2 with SortMergeJoinExample

use of org.apache.tez.examples.SortMergeJoinExample in project tez by apache.

the class TestTezJobs method testSortMergeJoinExample.

@Test(timeout = 60000)
public void testSortMergeJoinExample() throws Exception {
    SortMergeJoinExample sortMergeJoinExample = new SortMergeJoinExample();
    sortMergeJoinExample.setConf(new Configuration(mrrTezCluster.getConfig()));
    Path stagingDirPath = new Path("/tmp/tez-staging-dir");
    Path inPath1 = new Path("/tmp/sortMerge/inPath1");
    Path inPath2 = new Path("/tmp/sortMerge/inPath2");
    Path outPath = new Path("/tmp/sortMerge/outPath");
    remoteFs.mkdirs(inPath1);
    remoteFs.mkdirs(inPath2);
    remoteFs.mkdirs(stagingDirPath);
    Set<String> expectedResult = new HashSet<String>();
    FSDataOutputStream out1 = remoteFs.create(new Path(inPath1, "file"));
    FSDataOutputStream out2 = remoteFs.create(new Path(inPath2, "file"));
    BufferedWriter writer1 = new BufferedWriter(new OutputStreamWriter(out1));
    BufferedWriter writer2 = new BufferedWriter(new OutputStreamWriter(out2));
    for (int i = 0; i < 20; i++) {
        String term = "term" + i;
        writer1.write(term);
        writer1.newLine();
        if (i % 2 == 0) {
            writer2.write(term);
            writer2.newLine();
            expectedResult.add(term);
        }
    }
    writer1.close();
    writer2.close();
    out1.close();
    out2.close();
    String[] args = new String[] { "-D" + TezConfiguration.TEZ_AM_STAGING_DIR + "=" + stagingDirPath.toString(), "-D" + TezConfiguration.TEZ_AM_APPLICATION_PRIORITY + "=" + "2", "-counter", inPath1.toString(), inPath2.toString(), "1", outPath.toString() };
    assertEquals(0, sortMergeJoinExample.run(args));
    FileStatus[] statuses = remoteFs.listStatus(outPath, new PathFilter() {

        public boolean accept(Path p) {
            String name = p.getName();
            return !name.startsWith("_") && !name.startsWith(".");
        }
    });
    assertEquals(1, statuses.length);
    FSDataInputStream inStream = remoteFs.open(statuses[0].getPath());
    BufferedReader reader = new BufferedReader(new InputStreamReader(inStream));
    String line;
    while ((line = reader.readLine()) != null) {
        assertTrue(expectedResult.remove(line));
    }
    reader.close();
    inStream.close();
    assertEquals(0, expectedResult.size());
}
Also used : Path(org.apache.hadoop.fs.Path) PathFilter(org.apache.hadoop.fs.PathFilter) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) InputStreamReader(java.io.InputStreamReader) SortMergeJoinExample(org.apache.tez.examples.SortMergeJoinExample) BufferedWriter(java.io.BufferedWriter) BufferedReader(java.io.BufferedReader) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) OutputStreamWriter(java.io.OutputStreamWriter) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 3 with SortMergeJoinExample

use of org.apache.tez.examples.SortMergeJoinExample in project tez by apache.

the class TestTezJobs method testSortMergeJoinExampleDisableSplitGrouping.

@Test(timeout = 60000)
public void testSortMergeJoinExampleDisableSplitGrouping() throws Exception {
    SortMergeJoinExample sortMergeJoinExample = new SortMergeJoinExample();
    sortMergeJoinExample.setConf(conf);
    Path stagingDirPath = new Path(TEST_ROOT_DIR + "/tmp/tez-staging-dir");
    Path inPath1 = new Path(TEST_ROOT_DIR + "/tmp/sortMerge/inPath1");
    Path inPath2 = new Path(TEST_ROOT_DIR + "/tmp/sortMerge/inPath2");
    Path outPath = new Path(TEST_ROOT_DIR + "/tmp/sortMerge/outPath");
    localFs.delete(outPath, true);
    localFs.mkdirs(inPath1);
    localFs.mkdirs(inPath2);
    localFs.mkdirs(stagingDirPath);
    Set<String> expectedResult = new HashSet<String>();
    FSDataOutputStream out1 = localFs.create(new Path(inPath1, "file"));
    FSDataOutputStream out2 = localFs.create(new Path(inPath2, "file"));
    BufferedWriter writer1 = new BufferedWriter(new OutputStreamWriter(out1));
    BufferedWriter writer2 = new BufferedWriter(new OutputStreamWriter(out2));
    for (int i = 0; i < 20; i++) {
        String term = "term" + i;
        writer1.write(term);
        writer1.newLine();
        if (i % 2 == 0) {
            writer2.write(term);
            writer2.newLine();
            expectedResult.add(term);
        }
    }
    writer1.close();
    writer2.close();
    out1.close();
    out2.close();
    String[] args = new String[] { "-D" + TezConfiguration.TEZ_AM_STAGING_DIR + "=" + stagingDirPath.toString(), "-counter", "-local", "-disableSplitGrouping", inPath1.toString(), inPath2.toString(), "1", outPath.toString() };
    assertEquals(0, sortMergeJoinExample.run(args));
    FileStatus[] statuses = localFs.listStatus(outPath, new PathFilter() {

        public boolean accept(Path p) {
            String name = p.getName();
            return !name.startsWith("_") && !name.startsWith(".");
        }
    });
    assertEquals(1, statuses.length);
    FSDataInputStream inStream = localFs.open(statuses[0].getPath());
    BufferedReader reader = new BufferedReader(new InputStreamReader(inStream));
    String line;
    while ((line = reader.readLine()) != null) {
        assertTrue(expectedResult.remove(line));
    }
    reader.close();
    inStream.close();
    assertEquals(0, expectedResult.size());
}
Also used : Path(org.apache.hadoop.fs.Path) PathFilter(org.apache.hadoop.fs.PathFilter) FileStatus(org.apache.hadoop.fs.FileStatus) InputStreamReader(java.io.InputStreamReader) SortMergeJoinExample(org.apache.tez.examples.SortMergeJoinExample) BufferedWriter(java.io.BufferedWriter) BufferedReader(java.io.BufferedReader) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) OutputStreamWriter(java.io.OutputStreamWriter) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

Path (org.apache.hadoop.fs.Path)3 SortMergeJoinExample (org.apache.tez.examples.SortMergeJoinExample)3 Test (org.junit.Test)3 BufferedReader (java.io.BufferedReader)2 BufferedWriter (java.io.BufferedWriter)2 InputStreamReader (java.io.InputStreamReader)2 OutputStreamWriter (java.io.OutputStreamWriter)2 HashSet (java.util.HashSet)2 FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream)2 FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream)2 FileStatus (org.apache.hadoop.fs.FileStatus)2 PathFilter (org.apache.hadoop.fs.PathFilter)2 TezConfiguration (org.apache.tez.dag.api.TezConfiguration)2 Configuration (org.apache.hadoop.conf.Configuration)1 TezClient (org.apache.tez.client.TezClient)1 JoinDataGen (org.apache.tez.examples.JoinDataGen)1 JoinValidate (org.apache.tez.examples.JoinValidate)1