Use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context in project hive by apache.
From the class TestInputOutputFormat, method testSplitStrategySelection.
@Test
public void testSplitStrategySelection() throws Exception {
  conf.set("mapreduce.input.fileinputformat.split.maxsize", "500");
  conf.set(HiveConf.ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_MEMORY_SIZE.varname, "10Mb");
  final int[] counts = { 1, 10, 100, 256 };
  final int[] sizes = { 100, 1000 };
  final int[] numSplits = { 1, 9, 10, 11, 99, 111 };
  final String[] strategyResults = new String[] {
      "ETLSplitStrategy", /* 1 files x 100 size for 1 splits */
      "ETLSplitStrategy", /* 1 files x 100 size for 9 splits */
      "ETLSplitStrategy", /* 1 files x 100 size for 10 splits */
      "ETLSplitStrategy", /* 1 files x 100 size for 11 splits */
      "ETLSplitStrategy", /* 1 files x 100 size for 99 splits */
      "ETLSplitStrategy", /* 1 files x 100 size for 111 splits */
      "ETLSplitStrategy", /* 1 files x 1000 size for 1 splits */
      "ETLSplitStrategy", /* 1 files x 1000 size for 9 splits */
      "ETLSplitStrategy", /* 1 files x 1000 size for 10 splits */
      "ETLSplitStrategy", /* 1 files x 1000 size for 11 splits */
      "ETLSplitStrategy", /* 1 files x 1000 size for 99 splits */
      "ETLSplitStrategy", /* 1 files x 1000 size for 111 splits */
      "BISplitStrategy", /* 10 files x 100 size for 1 splits */
      "BISplitStrategy", /* 10 files x 100 size for 9 splits */
      "ETLSplitStrategy", /* 10 files x 100 size for 10 splits */
      "ETLSplitStrategy", /* 10 files x 100 size for 11 splits */
      "ETLSplitStrategy", /* 10 files x 100 size for 99 splits */
      "ETLSplitStrategy", /* 10 files x 100 size for 111 splits */
      "ETLSplitStrategy", /* 10 files x 1000 size for 1 splits */
      "ETLSplitStrategy", /* 10 files x 1000 size for 9 splits */
      "ETLSplitStrategy", /* 10 files x 1000 size for 10 splits */
      "ETLSplitStrategy", /* 10 files x 1000 size for 11 splits */
      "ETLSplitStrategy", /* 10 files x 1000 size for 99 splits */
      "ETLSplitStrategy", /* 10 files x 1000 size for 111 splits */
      "BISplitStrategy", /* 100 files x 100 size for 1 splits */
      "BISplitStrategy", /* 100 files x 100 size for 9 splits */
      "BISplitStrategy", /* 100 files x 100 size for 10 splits */
      "BISplitStrategy", /* 100 files x 100 size for 11 splits */
      "BISplitStrategy", /* 100 files x 100 size for 99 splits */
      "ETLSplitStrategy", /* 100 files x 100 size for 111 splits */
      "ETLSplitStrategy", /* 100 files x 1000 size for 1 splits */
      "ETLSplitStrategy", /* 100 files x 1000 size for 9 splits */
      "ETLSplitStrategy", /* 100 files x 1000 size for 10 splits */
      "ETLSplitStrategy", /* 100 files x 1000 size for 11 splits */
      "ETLSplitStrategy", /* 100 files x 1000 size for 99 splits */
      "ETLSplitStrategy", /* 100 files x 1000 size for 111 splits */
      "BISplitStrategy", /* 256 files x 100 size for 1 splits */
      "BISplitStrategy", /* 256 files x 100 size for 9 splits */
      "BISplitStrategy", /* 256 files x 100 size for 10 splits */
      "BISplitStrategy", /* 256 files x 100 size for 11 splits */
      "BISplitStrategy", /* 256 files x 100 size for 99 splits */
      "BISplitStrategy", /* 256 files x 100 size for 111 splits */
      "ETLSplitStrategy", /* 256 files x 1000 size for 1 splits */
      "ETLSplitStrategy", /* 256 files x 1000 size for 9 splits */
      "ETLSplitStrategy", /* 256 files x 1000 size for 10 splits */
      "ETLSplitStrategy", /* 256 files x 1000 size for 11 splits */
      "ETLSplitStrategy", /* 256 files x 1000 size for 99 splits */
      "ETLSplitStrategy" /* 256 files x 1000 size for 111 splits */ };
  int k = 0;
  for (int c : counts) {
    for (int s : sizes) {
      final FileSystem fs = generateMockFiles(c, s);
      for (int n : numSplits) {
        final OrcInputFormat.Context context = new OrcInputFormat.Context(conf, n);
        OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(
            context, fs, new MockPath(fs, "mock:/a/b"), false, null);
        List<SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen);
        assertEquals(1, splitStrategies.size());
        final SplitStrategy splitStrategy = splitStrategies.get(0);
        assertTrue(String.format("Split strategy for %d files x %d size for %d splits", c, s, n),
            splitStrategy.getClass().getSimpleName().equals(strategyResults[k++]));
      }
    }
  }
  // Repeat with the stripe-details footer cache disabled; the expected strategies are the same.
  k = 0;
  conf.set(HiveConf.ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_MEMORY_SIZE.varname, "0");
  for (int c : counts) {
    for (int s : sizes) {
      final FileSystem fs = generateMockFiles(c, s);
      for (int n : numSplits) {
        final OrcInputFormat.Context context = new OrcInputFormat.Context(conf, n);
        OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(
            context, fs, new MockPath(fs, "mock:/a/b"), false, null);
        List<SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen);
        assertEquals(1, splitStrategies.size());
        final SplitStrategy splitStrategy = splitStrategies.get(0);
        assertTrue(String.format("Split strategy for %d files x %d size for %d splits", c, s, n),
            splitStrategy.getClass().getSimpleName().equals(strategyResults[k++]));
      }
    }
  }
}
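The expected values above track Hive's HYBRID split-strategy heuristic: BISplitStrategy (which skips reading ORC footers) is picked only when the files are small relative to the configured maximum split size and there are more files than requested splits; everything else falls back to ETLSplitStrategy. Below is a minimal sketch of that decision, where chooseStrategy and its parameters are illustrative stand-ins, not OrcInputFormat's actual internals.

// Hypothetical sketch of the HYBRID heuristic exercised by the table above;
// chooseStrategy is not a real OrcInputFormat method.
static String chooseStrategy(int numFiles, long avgFileSize, int expectedSplits, long maxSplitSize) {
  // small files AND more files than requested splits -> cheap BI strategy
  boolean manySmallFiles = avgFileSize <= maxSplitSize && numFiles > expectedSplits;
  return manySmallFiles ? "BISplitStrategy" : "ETLSplitStrategy";
}

With maxSplitSize = 500 as set at the top of the test, this rule reproduces every entry in strategyResults: for example, 100 files of size 100 yield BISplitStrategy for up to 99 requested splits but ETLSplitStrategy for 111.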
Use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context in project hive by apache.
From the class TestInputOutputFormat, method testSplitGenerator.
@Test
public void testSplitGenerator() throws Exception {
  // create a file with 5 blocks spread around the cluster
  long[] stripeSizes = new long[] { 197, 300, 600, 200, 200, 100, 100, 100, 100, 100 };
  MockFileSystem fs = new MockFileSystem(conf,
      new MockFile("mock:/a/file", 500, createMockOrcFile(stripeSizes),
          new MockBlock("host1-1", "host1-2", "host1-3"),
          new MockBlock("host2-1", "host0", "host2-3"),
          new MockBlock("host0", "host3-2", "host3-3"),
          new MockBlock("host4-1", "host4-2", "host4-3"),
          new MockBlock("host5-1", "host5-2", "host5-3")));
  HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 300);
  HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 200);
  OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
  OrcInputFormat.SplitGenerator splitter = new OrcInputFormat.SplitGenerator(
      new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")), null, null,
          true, new ArrayList<AcidInputFormat.DeltaMetaData>(), true, null, null),
      null, true);
  List<OrcSplit> results = splitter.call();
  // stripes begin at offset 3, after the 3-byte ORC magic header
  OrcSplit result = results.get(0);
  assertEquals(3, result.getStart());
  assertEquals(497, result.getLength());
  result = results.get(1);
  assertEquals(500, result.getStart());
  assertEquals(600, result.getLength());
  result = results.get(2);
  assertEquals(1100, result.getStart());
  assertEquals(400, result.getLength());
  result = results.get(3);
  assertEquals(1500, result.getStart());
  assertEquals(300, result.getLength());
  result = results.get(4);
  assertEquals(1800, result.getStart());
  assertEquals(200, result.getLength());
  // test that min = 0, max = 0 generates one split per stripe
  HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 0);
  HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 0);
  context = new OrcInputFormat.Context(conf);
  splitter = new OrcInputFormat.SplitGenerator(
      new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")), null, null,
          true, new ArrayList<AcidInputFormat.DeltaMetaData>(), true, null, null),
      null, true);
  results = splitter.call();
  for (int i = 0; i < stripeSizes.length; ++i) {
    assertEquals("checking stripe " + i + " size", stripeSizes[i], results.get(i).getLength());
  }
}
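The asserted offsets follow from a greedy packing of consecutive stripes: starting at offset 3 (after the ORC magic bytes), stripes accumulate until the running length reaches MAPREDMAXSPLITSIZE, and a trailing remainder becomes a split if it reaches MAPREDMINSPLITSIZE. A simplified model of that behavior follows; packStripes is illustrative, not Hive's implementation, and uses java.util.List/ArrayList as elsewhere on this page.

// Each returned entry is {start, length}. With min=200, max=300 and the stripe sizes
// above this yields {3,497}, {500,600}, {1100,400}, {1500,300}, {1800,200}, matching
// the assertions; with min=0, max=0 every stripe becomes its own split.
static List<long[]> packStripes(long[] stripeSizes, long firstOffset, long min, long max) {
  List<long[]> splits = new ArrayList<>();
  long start = firstOffset;
  long len = 0;
  for (long stripe : stripeSizes) {
    len += stripe;
    if (len >= max) {                    // split is big enough: emit it
      splits.add(new long[] { start, len });
      start += len;
      len = 0;
    }
  }
  if (len > 0 && len >= min) {           // trailing stripes form a final split if large enough
    splits.add(new long[] { start, len });
  }
  return splits;
}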
Use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context in project hive by apache.
From the class TestInputOutputFormat, method testACIDSplitStrategy.
@Test
public void testACIDSplitStrategy() throws Exception {
  conf.set("bucket_count", "2");
  OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
  MockFileSystem fs = new MockFileSystem(conf,
      new MockFile("mock:/a/delta_000_001/part-00", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/delta_000_001/part-01", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/delta_001_002/part-02", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/delta_001_002/part-03", 1000, new byte[1], new MockBlock("host1")));
  OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(
      context, fs, new MockPath(fs, "mock:/a"), false, null);
  List<OrcInputFormat.SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen);
  assertTrue(splitStrategies.get(0) instanceof OrcInputFormat.ACIDSplitStrategy);
  List<OrcSplit> splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
  ColumnarSplitSizeEstimator splitSizeEstimator = new ColumnarSplitSizeEstimator();
  for (OrcSplit split : splits) {
    assertEquals(Integer.MAX_VALUE, splitSizeEstimator.getEstimatedSize(split));
  }
  assertEquals(2, splits.size());
}
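The four mock files sit in two delta directories that follow Hive's ACID naming convention, delta_<minTxn>_<maxTxn>; the strategy collapses them into two splits, matching the configured bucket count, and the estimator reports Integer.MAX_VALUE for each ACID split, presumably so downstream grouping never coalesces them. A tiny hypothetical helper (not part of Hive's API) shows how the transaction range encoded in those directory names can be read back:

// Hypothetical helper, not Hive API: parse "delta_000_001" -> {0, 1}.
static long[] parseDeltaRange(String dirName) {
  String[] parts = dirName.split("_");
  if (parts.length < 3 || !"delta".equals(parts[0])) {
    throw new IllegalArgumentException("not a delta directory: " + dirName);
  }
  return new long[] { Long.parseLong(parts[1]), Long.parseLong(parts[2]) };
}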
Use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context in project hive by apache.
From the class TestInputOutputFormat, method testProjectedColumnSize.
@Test
public void testProjectedColumnSize() throws Exception {
  long[] stripeSizes = new long[] { 200, 200, 200, 200, 100 };
  MockFileSystem fs = new MockFileSystem(conf,
      new MockFile("mock:/a/file", 500, createMockOrcFile(stripeSizes),
          new MockBlock("host1-1", "host1-2", "host1-3"),
          new MockBlock("host2-1", "host0", "host2-3"),
          new MockBlock("host0", "host3-2", "host3-3"),
          new MockBlock("host4-1", "host4-2", "host4-3"),
          new MockBlock("host5-1", "host5-2", "host5-3")));
  HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 300);
  HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 200);
  // project only column 0
  conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
  conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
  OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
  OrcInputFormat.SplitGenerator splitter = new OrcInputFormat.SplitGenerator(
      new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")), null, null,
          true, new ArrayList<AcidInputFormat.DeltaMetaData>(), true, null, null),
      null, true);
  List<OrcSplit> results = splitter.call();
  assertEquals(3, results.size());
  OrcSplit result = results.get(0);
  assertEquals(3, result.getStart());
  assertEquals(400, result.getLength());
  assertEquals(167468, result.getProjectedColumnsUncompressedSize());
  result = results.get(1);
  assertEquals(403, result.getStart());
  assertEquals(400, result.getLength());
  assertEquals(167468, result.getProjectedColumnsUncompressedSize());
  result = results.get(2);
  assertEquals(803, result.getStart());
  assertEquals(100, result.getLength());
  assertEquals(41867, result.getProjectedColumnsUncompressedSize());
  // test that min = 0, max = 0 generates one split per stripe
  HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 0);
  HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 0);
  context = new OrcInputFormat.Context(conf);
  splitter = new OrcInputFormat.SplitGenerator(
      new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")), null, null,
          true, new ArrayList<AcidInputFormat.DeltaMetaData>(), true, null, null),
      null, true);
  results = splitter.call();
  assertEquals(5, results.size());
  for (int i = 0; i < stripeSizes.length; ++i) {
    assertEquals("checking stripe " + i + " size", stripeSizes[i], results.get(i).getLength());
    if (i == stripeSizes.length - 1) {
      assertEquals(41867, results.get(i).getProjectedColumnsUncompressedSize());
    } else {
      assertEquals(83734, results.get(i).getProjectedColumnsUncompressedSize());
    }
  }
  // single split covering the whole file
  HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 1000);
  HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 100000);
  context = new OrcInputFormat.Context(conf);
  splitter = new OrcInputFormat.SplitGenerator(
      new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")), null, null,
          true, new ArrayList<AcidInputFormat.DeltaMetaData>(), true, null, null),
      null, true);
  results = splitter.call();
  assertEquals(1, results.size());
  result = results.get(0);
  assertEquals(3, result.getStart());
  assertEquals(900, result.getLength());
  assertEquals(376804, result.getProjectedColumnsUncompressedSize());
}
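The projected-size figures scale with how much of the file a split covers and with the single projected column: in the mock metadata each full 200-byte stripe contributes 83734 bytes of uncompressed column data and the trailing 100-byte stripe half that (41867), so a two-stripe split reports 167468. Below is a simplified model under the assumption that a split's estimate is (approximately) the sum over the stripes it covers; the helper and the per-stripe array are illustrative, with the exact constants coming from createMockOrcFile.

// Illustrative model, not the OrcSplit implementation: sum the uncompressed size of
// the projected columns over the stripes a split covers. Small rounding differences
// against the real estimator are possible (the whole-file split above reports 376804).
static long projectedSize(long[] perStripeProjectedBytes, int fromStripe, int toStripe) {
  long total = 0;
  for (int i = fromStripe; i < toStripe; i++) {   // toStripe is exclusive
    total += perStripeProjectedBytes[i];
  }
  return total;
}

For example, projectedSize(new long[] { 83734, 83734, 83734, 83734, 41867 }, 0, 2) returns 167468, matching the first split's assertion.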
Use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context in project hive by apache.
From the class TestInputOutputFormat, method testACIDSplitStrategyForSplitUpdate.
@Test
public void testACIDSplitStrategyForSplitUpdate() throws Exception {
  conf.set("bucket_count", "2");
  conf.set(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, "true");
  conf.set(hive_metastoreConstants.TABLE_TRANSACTIONAL_PROPERTIES, "default");
  OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
  // Case 1: Test with just originals => Single split strategy with two splits.
  MockFileSystem fs = new MockFileSystem(conf,
      new MockFile("mock:/a/b/000000_0", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/b/000000_1", 1000, new byte[1], new MockBlock("host1")));
  OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(
      context, fs, new MockPath(fs, "mock:/a"), false, null);
  List<OrcInputFormat.SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen);
  assertEquals(1, splitStrategies.size());
  assertTrue(splitStrategies.get(0) instanceof OrcInputFormat.ACIDSplitStrategy);
  List<OrcSplit> splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
  assertEquals(2, splits.size());
  assertEquals("mock:/a/b/000000_0", splits.get(0).getPath().toUri().toString());
  assertEquals("mock:/a/b/000000_1", splits.get(1).getPath().toUri().toString());
  assertTrue(splits.get(0).isOriginal());
  assertTrue(splits.get(1).isOriginal());
  // Case 2: Test with originals and base => Single split strategy with two splits on the
  // compacted base, since the presence of a base makes the originals obsolete.
  fs = new MockFileSystem(conf,
      new MockFile("mock:/a/b/000000_0", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/b/000000_1", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/base_0000001/bucket_00000", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/base_0000001/bucket_00001", 1000, new byte[1], new MockBlock("host1")));
  gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a"), false, null);
  splitStrategies = createSplitStrategies(context, gen);
  assertEquals(1, splitStrategies.size());
  assertTrue(splitStrategies.get(0) instanceof OrcInputFormat.ACIDSplitStrategy);
  splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
  assertEquals(2, splits.size());
  assertEquals("mock:/a/base_0000001/bucket_00000", splits.get(0).getPath().toUri().toString());
  assertEquals("mock:/a/base_0000001/bucket_00001", splits.get(1).getPath().toUri().toString());
  assertFalse(splits.get(0).isOriginal());
  assertFalse(splits.get(1).isOriginal());
  // Case 3: Test with originals and deltas => Two split strategies with two splits each.
  fs = new MockFileSystem(conf,
      new MockFile("mock:/a/b/000000_0", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/b/000000_1", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/delta_0000001_0000001_0000/bucket_00000", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/delta_0000001_0000001_0000/bucket_00001", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/delete_delta_0000001_0000001_0000/bucket_00000", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/delete_delta_0000001_0000001_0000/bucket_00001", 1000, new byte[1], new MockBlock("host1")));
  gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a"), false, null);
  splitStrategies = createSplitStrategies(context, gen);
  assertEquals(2, splitStrategies.size());
  assertTrue(splitStrategies.get(0) instanceof OrcInputFormat.ACIDSplitStrategy);
  splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
  assertEquals(2, splits.size());
  assertEquals("mock:/a/b/000000_0", splits.get(0).getPath().toUri().toString());
  assertEquals("mock:/a/b/000000_1", splits.get(1).getPath().toUri().toString());
  assertTrue(splits.get(0).isOriginal());
  assertTrue(splits.get(1).isOriginal());
  assertTrue(splitStrategies.get(1) instanceof OrcInputFormat.ACIDSplitStrategy);
  splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(1)).getSplits();
  assertEquals(2, splits.size());
  assertEquals("mock:/a/delta_0000001_0000001_0000/bucket_00000", splits.get(0).getPath().toUri().toString());
  assertEquals("mock:/a/delta_0000001_0000001_0000/bucket_00001", splits.get(1).getPath().toUri().toString());
  assertFalse(splits.get(0).isOriginal());
  assertFalse(splits.get(1).isOriginal());
  // Case 4: Test with originals and deltas, but with only one bucket covered: we have
  // originals & insert deltas for one bucket only, while the delete deltas cover two
  // buckets => Two strategies with one split each.
  // When split-update is enabled, buckets that are not covered can be ignored: the valid
  // user data has already been accounted for as base for the covered buckets, so the
  // uncovered buckets hold no relevant data.
  fs = new MockFileSystem(conf,
      new MockFile("mock:/a/b/000000_0", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/delta_0000001_0000001_0000/bucket_00000", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/delete_delta_0000001_0000001_0000/bucket_00000", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/delete_delta_0000001_0000001_0000/bucket_00001", 1000, new byte[1], new MockBlock("host1")));
  gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a"), false, null);
  splitStrategies = createSplitStrategies(context, gen);
  assertEquals(2, splitStrategies.size());
  assertTrue(splitStrategies.get(0) instanceof OrcInputFormat.ACIDSplitStrategy);
  splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
  assertEquals(1, splits.size());
  assertEquals("mock:/a/b/000000_0", splits.get(0).getPath().toUri().toString());
  assertTrue(splits.get(0).isOriginal());
  assertTrue(splitStrategies.get(1) instanceof OrcInputFormat.ACIDSplitStrategy);
  splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(1)).getSplits();
  assertEquals(1, splits.size());
  assertEquals("mock:/a/delta_0000001_0000001_0000/bucket_00000", splits.get(0).getPath().toUri().toString());
  assertFalse(splits.get(0).isOriginal());
  // Case 5: Test with originals, compacted base, insert deltas, and delete deltas
  // (exhaustive test). This should generate one strategy with splits for the base and
  // the insert deltas.
  fs = new MockFileSystem(conf,
      new MockFile("mock:/a/b/000000_0", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/b/000000_1", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/base_0000001/bucket_00000", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/base_0000001/bucket_00001", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/delta_0000002_0000002_0000/bucket_00000", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/delta_0000002_0000002_0000/bucket_00001", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/delete_delta_0000002_0000002_0000/bucket_00000", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/delete_delta_0000002_0000002_0000/bucket_00001", 1000, new byte[1], new MockBlock("host1")));
  gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a"), false, null);
  splitStrategies = createSplitStrategies(context, gen);
  assertEquals(1, splitStrategies.size());
  assertTrue(splitStrategies.get(0) instanceof OrcInputFormat.ACIDSplitStrategy);
  splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
  assertEquals(4, splits.size());
  assertEquals("mock:/a/base_0000001/bucket_00000", splits.get(0).getPath().toUri().toString());
  assertEquals("mock:/a/base_0000001/bucket_00001", splits.get(1).getPath().toUri().toString());
  assertEquals("mock:/a/delta_0000002_0000002_0000/bucket_00000", splits.get(2).getPath().toUri().toString());
  assertEquals("mock:/a/delta_0000002_0000002_0000/bucket_00001", splits.get(3).getPath().toUri().toString());
  assertFalse(splits.get(0).isOriginal());
  assertFalse(splits.get(1).isOriginal());
  assertFalse(splits.get(2).isOriginal());
  assertFalse(splits.get(3).isOriginal());
}
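All of these tests lean on helpers defined elsewhere in TestInputOutputFormat (generateMockFiles, createMockOrcFile, createSplitStrategies). As a rough sketch of the shape generateMockFiles plausibly has, assuming the MockFile(path, blockSize, content, blocks...) constructor used throughout this page; this is a guess for orientation, not the actual helper, and the part-file naming is invented:

// Hypothetical reconstruction, not the real helper: build `count` one-block mock files
// of `size` bytes each under mock:/a/b, the layout the strategy-selection test scans.
private FileSystem generateMockFiles(int count, int size) {
  MockFile[] files = new MockFile[count];
  for (int i = 0; i < count; i++) {
    files[i] = new MockFile(String.format("mock:/a/b/part-%04d", i), size, new byte[size],
        new MockBlock("host-" + (i % 10)));
  }
  return new MockFileSystem(conf, files);
}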