
Example 1 with SplitStrategy

Use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.SplitStrategy in project hive by apache.

From the class TestInputOutputFormat, method testSplitStrategySelection.

@Test
public void testSplitStrategySelection() throws Exception {
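    // A split max size of 500 bytes puts the 100 byte and 1000 byte mock files on
    // opposite sides of the size threshold used when picking a strategy. The stripe
    // details (footer) cache is first given 10Mb; the second pass below repeats the
    // same checks with the cache disabled and expects identical strategy choices.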
    conf.set("mapreduce.input.fileinputformat.split.maxsize", "500");
    conf.set(HiveConf.ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_MEMORY_SIZE.varname, "10Mb");
    final int[] counts = { 1, 10, 100, 256 };
    final int[] sizes = { 100, 1000 };
    final int[] numSplits = { 1, 9, 10, 11, 99, 111 };
    final String[] strategyResults = new String[] { "ETLSplitStrategy", /* 1 files x 100 size for 1 splits */
    "ETLSplitStrategy", /* 1 files x 100 size for 9 splits */
    "ETLSplitStrategy", /* 1 files x 100 size for 10 splits */
    "ETLSplitStrategy", /* 1 files x 100 size for 11 splits */
    "ETLSplitStrategy", /* 1 files x 100 size for 99 splits */
    "ETLSplitStrategy", /* 1 files x 100 size for 111 splits */
    "ETLSplitStrategy", /* 1 files x 1000 size for 1 splits */
    "ETLSplitStrategy", /* 1 files x 1000 size for 9 splits */
    "ETLSplitStrategy", /* 1 files x 1000 size for 10 splits */
    "ETLSplitStrategy", /* 1 files x 1000 size for 11 splits */
    "ETLSplitStrategy", /* 1 files x 1000 size for 99 splits */
    "ETLSplitStrategy", /* 1 files x 1000 size for 111 splits */
    "BISplitStrategy", /* 10 files x 100 size for 1 splits */
    "BISplitStrategy", /* 10 files x 100 size for 9 splits */
    "ETLSplitStrategy", /* 10 files x 100 size for 10 splits */
    "ETLSplitStrategy", /* 10 files x 100 size for 11 splits */
    "ETLSplitStrategy", /* 10 files x 100 size for 99 splits */
    "ETLSplitStrategy", /* 10 files x 100 size for 111 splits */
    "ETLSplitStrategy", /* 10 files x 1000 size for 1 splits */
    "ETLSplitStrategy", /* 10 files x 1000 size for 9 splits */
    "ETLSplitStrategy", /* 10 files x 1000 size for 10 splits */
    "ETLSplitStrategy", /* 10 files x 1000 size for 11 splits */
    "ETLSplitStrategy", /* 10 files x 1000 size for 99 splits */
    "ETLSplitStrategy", /* 10 files x 1000 size for 111 splits */
    "BISplitStrategy", /* 100 files x 100 size for 1 splits */
    "BISplitStrategy", /* 100 files x 100 size for 9 splits */
    "BISplitStrategy", /* 100 files x 100 size for 10 splits */
    "BISplitStrategy", /* 100 files x 100 size for 11 splits */
    "BISplitStrategy", /* 100 files x 100 size for 99 splits */
    "ETLSplitStrategy", /* 100 files x 100 size for 111 splits */
    "ETLSplitStrategy", /* 100 files x 1000 size for 1 splits */
    "ETLSplitStrategy", /* 100 files x 1000 size for 9 splits */
    "ETLSplitStrategy", /* 100 files x 1000 size for 10 splits */
    "ETLSplitStrategy", /* 100 files x 1000 size for 11 splits */
    "ETLSplitStrategy", /* 100 files x 1000 size for 99 splits */
    "ETLSplitStrategy", /* 100 files x 1000 size for 111 splits */
    "BISplitStrategy", /* 256 files x 100 size for 1 splits */
    "BISplitStrategy", /* 256 files x 100 size for 9 splits */
    "BISplitStrategy", /* 256 files x 100 size for 10 splits */
    "BISplitStrategy", /* 256 files x 100 size for 11 splits */
    "BISplitStrategy", /* 256 files x 100 size for 99 splits */
    "BISplitStrategy", /* 256 files x 100 size for 111 splits */
    "ETLSplitStrategy", /* 256 files x 1000 size for 1 splits */
    "ETLSplitStrategy", /* 256 files x 1000 size for 9 splits */
    "ETLSplitStrategy", /* 256 files x 1000 size for 10 splits */
    "ETLSplitStrategy", /* 256 files x 1000 size for 11 splits */
    "ETLSplitStrategy", /* 256 files x 1000 size for 99 splits */
    "ETLSplitStrategy" };
    int k = 0;
    for (int c : counts) {
        for (int s : sizes) {
            final FileSystem fs = generateMockFiles(c, s);
            for (int n : numSplits) {
                final OrcInputFormat.Context context = new OrcInputFormat.Context(conf, n);
                OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a/b"), false, null);
                List<SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen);
                assertEquals(1, splitStrategies.size());
                final SplitStrategy splitStrategy = splitStrategies.get(0);
                assertTrue(String.format("Split strategy for %d files x %d size for %d splits", c, s, n), splitStrategy.getClass().getSimpleName().equals(strategyResults[k++]));
            }
        }
    }
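    // Repeat the same checks with the stripe details cache disabled; the expected
    // strategy for each combination is unchanged.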
    k = 0;
    conf.set(ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_MEMORY_SIZE.varname, "0");
    for (int c : counts) {
        for (int s : sizes) {
            final FileSystem fs = generateMockFiles(c, s);
            for (int n : numSplits) {
                final OrcInputFormat.Context context = new OrcInputFormat.Context(conf, n);
                OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a/b"), false, null);
                List<SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen);
                assertEquals(1, splitStrategies.size());
                final SplitStrategy splitStrategy = splitStrategies.get(0);
                assertTrue(String.format("Split strategy for %d files x %d size for %d splits", c, s, n), splitStrategy.getClass().getSimpleName().equals(strategyResults[k++]));
            }
        }
    }
}
Also used : Context(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context) FileSystem(org.apache.hadoop.fs.FileSystem) SplitStrategy(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.SplitStrategy) Test(org.junit.Test)
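
The expected results above follow a simple pattern: BISplitStrategy is only chosen when there are more small files than the number of splits being requested; a single file, larger files, or a requested split count at or above the file count all push the selection to ETLSplitStrategy. A minimal sketch of that rule, inferred only from the expectations in this test (the real HYBRID selection lives inside OrcInputFormat and also looks at the configured split strategy, ACID layout, and file metadata), might look like this:

// Hypothetical helper, reconstructed from the expected results above; it is not
// the actual OrcInputFormat code. maxSplitSize stands in for
// mapreduce.input.fileinputformat.split.maxsize (500 in this test).
static String expectedStrategy(int fileCount, long fileSize, int requestedSplits, long maxSplitSize) {
    // Every mock file in this test has the same size, so the average equals fileSize.
    long avgFileSize = fileSize;
    boolean manySmallFiles =
        // small files: the 100 byte files qualify, the 1000 byte files do not
        avgFileSize <= maxSplitSize
        // and strictly more files than the requested number of splits
        && fileCount > requestedSplits;
    return manySmallFiles ? "BISplitStrategy" : "ETLSplitStrategy";
}

For example, expectedStrategy(100, 100, 99, 500) yields "BISplitStrategy" while expectedStrategy(100, 100, 111, 500) yields "ETLSplitStrategy", matching the two entries in the table above that flip between 99 and 111 splits for 100 files of size 100.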

Example 2 with SplitStrategy

Use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.SplitStrategy in project hive by apache.

From the class TestInputOutputFormat, method testEtlCombinedStrategy.

@Test
public void testEtlCombinedStrategy() throws Exception {
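    // Force the ETL strategy, and use a very long directory batch window so that
    // consecutive directories are offered to the combine logic rather than being
    // flushed immediately (1000000 ms is effectively unlimited for this test).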
    conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "ETL");
    conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_DIRECTORY_BATCH_MS.varname, "1000000");
    OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
    MockFileSystem fs = new MockFileSystem(conf,
        new MockFile("mock:/a/1/part-00", 1000, new byte[1]),
        new MockFile("mock:/a/1/part-01", 1000, new byte[1]),
        new MockFile("mock:/a/2/part-00", 1000, new byte[1]),
        new MockFile("mock:/a/2/part-01", 1000, new byte[1]),
        new MockFile("mock:/a/3/base_0/1", 1000, new byte[1]),
        new MockFile("mock:/a/4/base_0/1", 1000, new byte[1]),
        new MockFile("mock:/a/5/base_0/1", 1000, new byte[1]),
        new MockFile("mock:/a/5/delta_0_25/1", 1000, new byte[1]));
    OrcInputFormat.CombinedCtx combineCtx = new OrcInputFormat.CombinedCtx();
    // The first directory becomes the base for combining.
    List<SplitStrategy<?>> ss = createOrCombineStrategies(context, fs, "mock:/a/1", combineCtx);
    assertTrue(ss.isEmpty());
    assertTrue(combineCtx.combined instanceof OrcInputFormat.ETLSplitStrategy);
    OrcInputFormat.ETLSplitStrategy etlSs = combineCtx.combined;
    assertEquals(2, etlSs.files.size());
    assertTrue(etlSs.isOriginal);
    assertEquals(1, etlSs.dirs.size());
    // The second one should be combined into the first.
    ss = createOrCombineStrategies(context, fs, "mock:/a/2", combineCtx);
    assertTrue(ss.isEmpty());
    assertTrue(combineCtx.combined instanceof OrcInputFormat.ETLSplitStrategy);
    assertEquals(4, etlSs.files.size());
    assertEquals(2, etlSs.dirs.size());
    // The third one has the base file, so it shouldn't be combined but could be a base.
    ss = createOrCombineStrategies(context, fs, "mock:/a/3", combineCtx);
    assertEquals(1, ss.size());
    assertSame(etlSs, ss.get(0));
    assertEquals(4, etlSs.files.size());
    assertEquals(2, etlSs.dirs.size());
    assertTrue(combineCtx.combined instanceof OrcInputFormat.ETLSplitStrategy);
    etlSs = combineCtx.combined;
    assertEquals(1, etlSs.files.size());
    assertFalse(etlSs.isOriginal);
    assertEquals(1, etlSs.dirs.size());
    // Try the first again, it would not be combined and we'd retain the old base (less files).
    ss = createOrCombineStrategies(context, fs, "mock:/a/1", combineCtx);
    assertEquals(1, ss.size());
    assertTrue(ss.get(0) instanceof OrcInputFormat.ETLSplitStrategy);
    assertNotSame(etlSs, ss.get(0));
    OrcInputFormat.ETLSplitStrategy rejectedEtlSs = (OrcInputFormat.ETLSplitStrategy) ss.get(0);
    assertEquals(2, rejectedEtlSs.files.size());
    assertEquals(1, rejectedEtlSs.dirs.size());
    assertTrue(rejectedEtlSs.isOriginal);
    assertEquals(1, etlSs.files.size());
    assertEquals(1, etlSs.dirs.size());
    // The fourth could be combined again.
    ss = createOrCombineStrategies(context, fs, "mock:/a/4", combineCtx);
    assertTrue(ss.isEmpty());
    assertTrue(combineCtx.combined instanceof OrcInputFormat.ETLSplitStrategy);
    assertEquals(2, etlSs.files.size());
    assertEquals(2, etlSs.dirs.size());
    // The fifth will not be combined because of delta files.
    ss = createOrCombineStrategies(context, fs, "mock:/a/5", combineCtx);
    assertEquals(1, ss.size());
    assertTrue(ss.get(0) instanceof OrcInputFormat.ETLSplitStrategy);
    assertNotSame(etlSs, ss);
    assertEquals(2, etlSs.files.size());
    assertEquals(2, etlSs.dirs.size());
}
Also used : Context(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context) SplitStrategy(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.SplitStrategy) Test(org.junit.Test)
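
Both tests drive strategy selection purely through configuration, using the same HiveConf keys an application would set. A minimal sketch of that setup (the surrounding helper method is hypothetical; only the property names and the BI/ETL/HYBRID values come from the examples above and standard Hive configuration) could look like this:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;

// Hypothetical configuration helper: pin the ORC split strategy and the target
// split size before split generation, as the two tests above do.
static Configuration orcSplitConf(String strategy, long maxSplitSize) {
    Configuration conf = new Configuration();
    // "ETL", "BI", or "HYBRID"; HYBRID lets OrcInputFormat choose per directory.
    conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, strategy);
    // Upper bound on split size, the same key testSplitStrategySelection sets to 500.
    conf.set("mapreduce.input.fileinputformat.split.maxsize", Long.toString(maxSplitSize));
    return conf;
}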

Aggregations

Context (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context): 2 uses
SplitStrategy (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.SplitStrategy): 2 uses
Test (org.junit.Test): 2 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 1 use