Usage of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.SplitStrategy in the Apache Hive project.
From class TestInputOutputFormat, method testSplitStrategySelection:
@Test
public void testSplitStrategySelection() throws Exception {
  // A tiny max split size makes the strategy choice sensitive to the
  // file count / file size / requested split count combination.
  conf.set("mapreduce.input.fileinputformat.split.maxsize", "500");
  conf.set(HiveConf.ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_MEMORY_SIZE.varname, "10Mb");
  final int[] counts = { 1, 10, 100, 256 };
  final int[] sizes = { 100, 1000 };
  final int[] numSplits = { 1, 9, 10, 11, 99, 111 };
  // Expected strategy per (count, size, numSplits) tuple, iterated in
  // counts-major, sizes-middle, numSplits-minor order.
  final String[] strategyResults = new String[] { "ETLSplitStrategy", /* 1 files x 100 size for 1 splits */
  "ETLSplitStrategy", /* 1 files x 100 size for 9 splits */
  "ETLSplitStrategy", /* 1 files x 100 size for 10 splits */
  "ETLSplitStrategy", /* 1 files x 100 size for 11 splits */
  "ETLSplitStrategy", /* 1 files x 100 size for 99 splits */
  "ETLSplitStrategy", /* 1 files x 100 size for 111 splits */
  "ETLSplitStrategy", /* 1 files x 1000 size for 1 splits */
  "ETLSplitStrategy", /* 1 files x 1000 size for 9 splits */
  "ETLSplitStrategy", /* 1 files x 1000 size for 10 splits */
  "ETLSplitStrategy", /* 1 files x 1000 size for 11 splits */
  "ETLSplitStrategy", /* 1 files x 1000 size for 99 splits */
  "ETLSplitStrategy", /* 1 files x 1000 size for 111 splits */
  "BISplitStrategy", /* 10 files x 100 size for 1 splits */
  "BISplitStrategy", /* 10 files x 100 size for 9 splits */
  "ETLSplitStrategy", /* 10 files x 100 size for 10 splits */
  "ETLSplitStrategy", /* 10 files x 100 size for 11 splits */
  "ETLSplitStrategy", /* 10 files x 100 size for 99 splits */
  "ETLSplitStrategy", /* 10 files x 100 size for 111 splits */
  "ETLSplitStrategy", /* 10 files x 1000 size for 1 splits */
  "ETLSplitStrategy", /* 10 files x 1000 size for 9 splits */
  "ETLSplitStrategy", /* 10 files x 1000 size for 10 splits */
  "ETLSplitStrategy", /* 10 files x 1000 size for 11 splits */
  "ETLSplitStrategy", /* 10 files x 1000 size for 99 splits */
  "ETLSplitStrategy", /* 10 files x 1000 size for 111 splits */
  "BISplitStrategy", /* 100 files x 100 size for 1 splits */
  "BISplitStrategy", /* 100 files x 100 size for 9 splits */
  "BISplitStrategy", /* 100 files x 100 size for 10 splits */
  "BISplitStrategy", /* 100 files x 100 size for 11 splits */
  "BISplitStrategy", /* 100 files x 100 size for 99 splits */
  "ETLSplitStrategy", /* 100 files x 100 size for 111 splits */
  "ETLSplitStrategy", /* 100 files x 1000 size for 1 splits */
  "ETLSplitStrategy", /* 100 files x 1000 size for 9 splits */
  "ETLSplitStrategy", /* 100 files x 1000 size for 10 splits */
  "ETLSplitStrategy", /* 100 files x 1000 size for 11 splits */
  "ETLSplitStrategy", /* 100 files x 1000 size for 99 splits */
  "ETLSplitStrategy", /* 100 files x 1000 size for 111 splits */
  "BISplitStrategy", /* 256 files x 100 size for 1 splits */
  "BISplitStrategy", /* 256 files x 100 size for 9 splits */
  "BISplitStrategy", /* 256 files x 100 size for 10 splits */
  "BISplitStrategy", /* 256 files x 100 size for 11 splits */
  "BISplitStrategy", /* 256 files x 100 size for 99 splits */
  "BISplitStrategy", /* 256 files x 100 size for 111 splits */
  "ETLSplitStrategy", /* 256 files x 1000 size for 1 splits */
  "ETLSplitStrategy", /* 256 files x 1000 size for 9 splits */
  "ETLSplitStrategy", /* 256 files x 1000 size for 10 splits */
  "ETLSplitStrategy", /* 256 files x 1000 size for 11 splits */
  "ETLSplitStrategy", /* 256 files x 1000 size for 99 splits */
  "ETLSplitStrategy" };
  // With the footer cache enabled.
  verifySplitStrategySelection(counts, sizes, numSplits, strategyResults);
  // Disabling the footer cache must not change the strategy selection.
  conf.set(HiveConf.ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_MEMORY_SIZE.varname, "0");
  verifySplitStrategySelection(counts, sizes, numSplits, strategyResults);
}

/**
 * Asserts that each (file count, file size, requested split count) combination
 * selects the expected split strategy. Expectations are consumed from
 * {@code strategyResults} in counts-major, sizes-middle, numSplits-minor order.
 */
private void verifySplitStrategySelection(int[] counts, int[] sizes, int[] numSplits,
    String[] strategyResults) throws Exception {
  int k = 0;
  for (int c : counts) {
    for (int s : sizes) {
      final FileSystem fs = generateMockFiles(c, s);
      for (int n : numSplits) {
        final OrcInputFormat.Context context = new OrcInputFormat.Context(conf, n);
        OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(
            context, fs, new MockPath(fs, "mock:/a/b"), false, null);
        List<SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen);
        assertEquals(1, splitStrategies.size());
        final SplitStrategy<?> splitStrategy = splitStrategies.get(0);
        // assertEquals (rather than assertTrue on equals) reports the actual
        // strategy name on failure.
        assertEquals(String.format("Split strategy for %d files x %d size for %d splits", c, s, n),
            strategyResults[k++], splitStrategy.getClass().getSimpleName());
      }
    }
  }
}
Usage of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.SplitStrategy in the Apache Hive project.
From class TestInputOutputFormat, method testEtlCombinedStrategy:
@Test
public void testEtlCombinedStrategy() throws Exception {
  // Force the ETL strategy and a huge batching window so directories are
  // eligible for combining into a single ETLSplitStrategy.
  conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "ETL");
  conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_DIRECTORY_BATCH_MS.varname, "1000000");
  OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
  MockFileSystem fs = new MockFileSystem(conf,
      new MockFile("mock:/a/1/part-00", 1000, new byte[1]),
      new MockFile("mock:/a/1/part-01", 1000, new byte[1]),
      new MockFile("mock:/a/2/part-00", 1000, new byte[1]),
      new MockFile("mock:/a/2/part-01", 1000, new byte[1]),
      new MockFile("mock:/a/3/base_0/1", 1000, new byte[1]),
      new MockFile("mock:/a/4/base_0/1", 1000, new byte[1]),
      new MockFile("mock:/a/5/base_0/1", 1000, new byte[1]),
      new MockFile("mock:/a/5/delta_0_25/1", 1000, new byte[1]));
  OrcInputFormat.CombinedCtx combineCtx = new OrcInputFormat.CombinedCtx();
  // The first directory becomes the base for combining.
  List<SplitStrategy<?>> ss = createOrCombineStrategies(context, fs, "mock:/a/1", combineCtx);
  assertTrue(ss.isEmpty());
  assertTrue(combineCtx.combined instanceof OrcInputFormat.ETLSplitStrategy);
  OrcInputFormat.ETLSplitStrategy etlSs = combineCtx.combined;
  assertEquals(2, etlSs.files.size());
  assertTrue(etlSs.isOriginal);
  assertEquals(1, etlSs.dirs.size());
  // The second one should be combined into the first.
  ss = createOrCombineStrategies(context, fs, "mock:/a/2", combineCtx);
  assertTrue(ss.isEmpty());
  assertTrue(combineCtx.combined instanceof OrcInputFormat.ETLSplitStrategy);
  assertEquals(4, etlSs.files.size());
  assertEquals(2, etlSs.dirs.size());
  // The third one has the base file, so it shouldn't be combined but could be a base.
  // The previous combined strategy is evicted (returned) and replaced.
  ss = createOrCombineStrategies(context, fs, "mock:/a/3", combineCtx);
  assertEquals(1, ss.size());
  assertSame(etlSs, ss.get(0));
  assertEquals(4, etlSs.files.size());
  assertEquals(2, etlSs.dirs.size());
  assertTrue(combineCtx.combined instanceof OrcInputFormat.ETLSplitStrategy);
  etlSs = combineCtx.combined;
  assertEquals(1, etlSs.files.size());
  assertFalse(etlSs.isOriginal);
  assertEquals(1, etlSs.dirs.size());
  // Try the first again, it would not be combined and we'd retain the old base (less files).
  ss = createOrCombineStrategies(context, fs, "mock:/a/1", combineCtx);
  assertEquals(1, ss.size());
  assertTrue(ss.get(0) instanceof OrcInputFormat.ETLSplitStrategy);
  assertNotSame(etlSs, ss.get(0));
  OrcInputFormat.ETLSplitStrategy rejectedEtlSs = (OrcInputFormat.ETLSplitStrategy) ss.get(0);
  assertEquals(2, rejectedEtlSs.files.size());
  assertEquals(1, rejectedEtlSs.dirs.size());
  assertTrue(rejectedEtlSs.isOriginal);
  assertEquals(1, etlSs.files.size());
  assertEquals(1, etlSs.dirs.size());
  // The fourth could be combined again.
  ss = createOrCombineStrategies(context, fs, "mock:/a/4", combineCtx);
  assertTrue(ss.isEmpty());
  assertTrue(combineCtx.combined instanceof OrcInputFormat.ETLSplitStrategy);
  assertEquals(2, etlSs.files.size());
  assertEquals(2, etlSs.dirs.size());
  // The fifth will not be combined because of delta files.
  ss = createOrCombineStrategies(context, fs, "mock:/a/5", combineCtx);
  assertEquals(1, ss.size());
  assertTrue(ss.get(0) instanceof OrcInputFormat.ETLSplitStrategy);
  // BUG FIX: the original asserted assertNotSame(etlSs, ss) — comparing a
  // strategy to the List itself, which trivially passes. Compare against the
  // returned element, as done for the earlier rejection above.
  assertNotSame(etlSs, ss.get(0));
  assertEquals(2, etlSs.files.size());
  assertEquals(2, etlSs.dirs.size());
}
Aggregations