Use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context in project hive by apache.
From the class TestInputOutputFormat, method testProjectedColumnSize:
@Test
public void testProjectedColumnSize() throws Exception {
  long[] stripeSizes = new long[] { 200, 200, 200, 200, 100 };
  MockFileSystem fs = new MockFileSystem(conf,
      new MockFile("mock:/a/file", 500, createMockOrcFile(stripeSizes),
          new MockBlock("host1-1", "host1-2", "host1-3"),
          new MockBlock("host2-1", "host0", "host2-3"),
          new MockBlock("host0", "host3-2", "host3-3"),
          new MockBlock("host4-1", "host4-2", "host4-3"),
          new MockBlock("host5-1", "host5-2", "host5-3")));
  HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 300);
  HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 200);
  // project only column 0 instead of reading all columns
  conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
  conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
  OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
  OrcInputFormat.SplitGenerator splitter = new OrcInputFormat.SplitGenerator(
      new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")),
          null, null, true, new ArrayList<AcidInputFormat.DeltaMetaData>(),
          true, null, null),
      null, true, true);
  List<OrcSplit> results = splitter.call();
  OrcSplit result = results.get(0);
  assertEquals(3, results.size());
  assertEquals(3, result.getStart());
  assertEquals(400, result.getLength());
  assertEquals(167468, result.getProjectedColumnsUncompressedSize());
  result = results.get(1);
  assertEquals(403, result.getStart());
  assertEquals(400, result.getLength());
  assertEquals(167468, result.getProjectedColumnsUncompressedSize());
  result = results.get(2);
  assertEquals(803, result.getStart());
  assertEquals(100, result.getLength());
  assertEquals(41867, result.getProjectedColumnsUncompressedSize());
  // test that min = 0, max = 0 generates one split per stripe
  HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 0);
  HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 0);
  context = new OrcInputFormat.Context(conf);
  splitter = new OrcInputFormat.SplitGenerator(
      new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")),
          null, null, true, new ArrayList<AcidInputFormat.DeltaMetaData>(),
          true, null, null),
      null, true, true);
  results = splitter.call();
  assertEquals(5, results.size());
  for (int i = 0; i < stripeSizes.length; ++i) {
    assertEquals("checking stripe " + i + " size",
        stripeSizes[i], results.get(i).getLength());
    if (i == stripeSizes.length - 1) {
      assertEquals(41867, results.get(i).getProjectedColumnsUncompressedSize());
    } else {
      assertEquals(83734, results.get(i).getProjectedColumnsUncompressedSize());
    }
  }
  // min/max sizes large enough that the whole file becomes a single split
  HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 1000);
  HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 100000);
  context = new OrcInputFormat.Context(conf);
  splitter = new OrcInputFormat.SplitGenerator(
      new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")),
          null, null, true, new ArrayList<AcidInputFormat.DeltaMetaData>(),
          true, null, null),
      null, true, true);
  results = splitter.call();
  assertEquals(1, results.size());
  result = results.get(0);
  assertEquals(3, result.getStart());
  assertEquals(900, result.getLength());
  assertEquals(376804, result.getProjectedColumnsUncompressedSize());
}
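The projected-size assertions are internally consistent: each 200-byte stripe contributes 83,734 bytes of estimated uncompressed data for column 0 and the trailing 100-byte stripe contributes 41,867, so a two-stripe split reports 167,468 (the whole-file figure of 376,804 is one byte above the per-stripe sum, presumably from rounding inside the estimator). A minimal sketch for eyeballing what a generator produced; dumpSplits is an illustrative helper, not part of the Hive test, but all three getters appear in the assertions above:

// Illustrative only: print offset, length, and the projected-column
// uncompressed-size estimate for each generated split.
static void dumpSplits(List<OrcSplit> splits) {
  for (OrcSplit split : splits) {
    System.out.printf("start=%d length=%d projectedBytes=%d%n",
        split.getStart(), split.getLength(),
        split.getProjectedColumnsUncompressedSize());
  }
}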
Use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context in project hive by apache.
From the class TestInputOutputFormat, method testAddSplit:
@Test
public void testAddSplit() throws Exception {
  // create a file with 5 blocks spread around the cluster
  MockFileSystem fs = new MockFileSystem(conf,
      new MockFile("mock:/a/file", 500,
          createMockOrcFile(197, 300, 600, 200, 200, 100, 100, 100, 100, 100),
          new MockBlock("host1-1", "host1-2", "host1-3"),
          new MockBlock("host2-1", "host0", "host2-3"),
          new MockBlock("host0", "host3-2", "host3-3"),
          new MockBlock("host4-1", "host4-2", "host4-3"),
          new MockBlock("host5-1", "host5-2", "host5-3")));
  OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
  OrcInputFormat.SplitGenerator splitter = new OrcInputFormat.SplitGenerator(
      new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")),
          null, null, true, new ArrayList<AcidInputFormat.DeltaMetaData>(),
          true, null, null),
      null, true, true);
  OrcSplit result = splitter.createSplit(0, 200, null);
  assertEquals(0, result.getStart());
  assertEquals(200, result.getLength());
  assertEquals("mock:/a/file", result.getPath().toString());
  String[] locs = result.getLocations();
  assertEquals(3, locs.length);
  assertEquals("host1-1", locs[0]);
  assertEquals("host1-2", locs[1]);
  assertEquals("host1-3", locs[2]);
  result = splitter.createSplit(500, 600, null);
  locs = result.getLocations();
  assertEquals(3, locs.length);
  assertEquals("host2-1", locs[0]);
  assertEquals("host0", locs[1]);
  assertEquals("host2-3", locs[2]);
  result = splitter.createSplit(0, 2500, null);
  locs = result.getLocations();
  assertEquals(1, locs.length);
  assertEquals("host0", locs[0]);
}
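The three cases exercise the locality logic: a split contained in a single 500-byte block (offset 0, length 200) inherits that block's three replicas, a split spanning two blocks (offset 500, length 600) is credited to the block holding most of its bytes, and the whole-file split (offset 0, length 2500) reports only host0, the one host that appears in more than one block. A sketch of the same call for a range aligned with the fourth block; the expected hosts here are an inference from that pattern, not an assertion taken from the Hive source:

// Illustrative: bytes 1500-2000 fall entirely within the fourth mock block,
// so its replicas should presumably be the reported locations.
OrcSplit mid = splitter.createSplit(1500, 500, null);
for (String host : mid.getLocations()) {
  System.out.println(host); // expected: host4-1, host4-2, host4-3
}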
Use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context in project hive by apache.
From the class TestInputOutputFormat, method testACIDSplitStrategy:
@Test
public void testACIDSplitStrategy() throws Exception {
  conf.set("bucket_count", "2");
  conf.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
  OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
  MockFileSystem fs = new MockFileSystem(conf,
      new MockFile("mock:/a/delta_000_001/bucket_000000", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/delta_000_001/bucket_000001", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/delta_001_002/bucket_000000", 1000, new byte[1], new MockBlock("host1")),
      new MockFile("mock:/a/delta_001_002/bucket_000001", 1000, new byte[1], new MockBlock("host1")));
  OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(
      context, fs, new MockPath(fs, "mock:/a"), false, null);
  List<OrcInputFormat.SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen);
  assertTrue(splitStrategies.get(0) instanceof OrcInputFormat.ACIDSplitStrategy);
  List<OrcSplit> splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
  ColumnarSplitSizeEstimator splitSizeEstimator = new ColumnarSplitSizeEstimator();
  for (OrcSplit split : splits) {
    assertEquals(1, splitSizeEstimator.getEstimatedSize(split));
  }
  assertEquals(4, splits.size());
}
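Because the table is flagged transactional and the directory contains only delta subdirectories, the chosen strategy is ACIDSplitStrategy and each of the four bucket files becomes its own split. Each mock file's content is a single byte (new byte[1], despite the 1000-byte block size), which is why ColumnarSplitSizeEstimator reports an estimated size of 1 for every split.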
Use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context in project hive by apache.
From the class TestInputOutputFormat, method testSplitStrategySelection:
@Test
public void testSplitStrategySelection() throws Exception {
  conf.set("mapreduce.input.fileinputformat.split.maxsize", "500");
  conf.set(HiveConf.ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_MEMORY_SIZE.varname, "10Mb");
  final int[] counts = { 1, 10, 100, 256 };
  final int[] sizes = { 100, 1000 };
  final int[] numSplits = { 1, 9, 10, 11, 99, 111 };
  final String[] strategyResults = new String[] {
      "ETLSplitStrategy", /* 1 files x 100 size for 1 splits */
      "ETLSplitStrategy", /* 1 files x 100 size for 9 splits */
      "ETLSplitStrategy", /* 1 files x 100 size for 10 splits */
      "ETLSplitStrategy", /* 1 files x 100 size for 11 splits */
      "ETLSplitStrategy", /* 1 files x 100 size for 99 splits */
      "ETLSplitStrategy", /* 1 files x 100 size for 111 splits */
      "ETLSplitStrategy", /* 1 files x 1000 size for 1 splits */
      "ETLSplitStrategy", /* 1 files x 1000 size for 9 splits */
      "ETLSplitStrategy", /* 1 files x 1000 size for 10 splits */
      "ETLSplitStrategy", /* 1 files x 1000 size for 11 splits */
      "ETLSplitStrategy", /* 1 files x 1000 size for 99 splits */
      "ETLSplitStrategy", /* 1 files x 1000 size for 111 splits */
      "BISplitStrategy", /* 10 files x 100 size for 1 splits */
      "BISplitStrategy", /* 10 files x 100 size for 9 splits */
      "ETLSplitStrategy", /* 10 files x 100 size for 10 splits */
      "ETLSplitStrategy", /* 10 files x 100 size for 11 splits */
      "ETLSplitStrategy", /* 10 files x 100 size for 99 splits */
      "ETLSplitStrategy", /* 10 files x 100 size for 111 splits */
      "ETLSplitStrategy", /* 10 files x 1000 size for 1 splits */
      "ETLSplitStrategy", /* 10 files x 1000 size for 9 splits */
      "ETLSplitStrategy", /* 10 files x 1000 size for 10 splits */
      "ETLSplitStrategy", /* 10 files x 1000 size for 11 splits */
      "ETLSplitStrategy", /* 10 files x 1000 size for 99 splits */
      "ETLSplitStrategy", /* 10 files x 1000 size for 111 splits */
      "BISplitStrategy", /* 100 files x 100 size for 1 splits */
      "BISplitStrategy", /* 100 files x 100 size for 9 splits */
      "BISplitStrategy", /* 100 files x 100 size for 10 splits */
      "BISplitStrategy", /* 100 files x 100 size for 11 splits */
      "BISplitStrategy", /* 100 files x 100 size for 99 splits */
      "ETLSplitStrategy", /* 100 files x 100 size for 111 splits */
      "ETLSplitStrategy", /* 100 files x 1000 size for 1 splits */
      "ETLSplitStrategy", /* 100 files x 1000 size for 9 splits */
      "ETLSplitStrategy", /* 100 files x 1000 size for 10 splits */
      "ETLSplitStrategy", /* 100 files x 1000 size for 11 splits */
      "ETLSplitStrategy", /* 100 files x 1000 size for 99 splits */
      "ETLSplitStrategy", /* 100 files x 1000 size for 111 splits */
      "BISplitStrategy", /* 256 files x 100 size for 1 splits */
      "BISplitStrategy", /* 256 files x 100 size for 9 splits */
      "BISplitStrategy", /* 256 files x 100 size for 10 splits */
      "BISplitStrategy", /* 256 files x 100 size for 11 splits */
      "BISplitStrategy", /* 256 files x 100 size for 99 splits */
      "BISplitStrategy", /* 256 files x 100 size for 111 splits */
      "ETLSplitStrategy", /* 256 files x 1000 size for 1 splits */
      "ETLSplitStrategy", /* 256 files x 1000 size for 9 splits */
      "ETLSplitStrategy", /* 256 files x 1000 size for 10 splits */
      "ETLSplitStrategy", /* 256 files x 1000 size for 11 splits */
      "ETLSplitStrategy", /* 256 files x 1000 size for 99 splits */
      "ETLSplitStrategy" /* 256 files x 1000 size for 111 splits */
  };
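  // A pattern one can read off the expectations above (an observation from
  // this table, not a statement of OrcInputFormat's documented contract):
  // BISplitStrategy is expected only when the files are small (100 bytes,
  // under the 500-byte max split size) and there are more files than
  // requested splits; every other combination expects ETLSplitStrategy.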
  int k = 0;
  for (int c : counts) {
    for (int s : sizes) {
      final FileSystem fs = generateMockFiles(c, s);
      for (int n : numSplits) {
        final OrcInputFormat.Context context = new OrcInputFormat.Context(conf, n);
        OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(
            context, fs, new MockPath(fs, "mock:/a/b"), false, null);
        List<SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen);
        assertEquals(1, splitStrategies.size());
        final SplitStrategy splitStrategy = splitStrategies.get(0);
        assertTrue(
            String.format("Split strategy for %d files x %d size for %d splits", c, s, n),
            splitStrategy.getClass().getSimpleName().equals(strategyResults[k++]));
      }
    }
  }
  k = 0;
  conf.set(ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_MEMORY_SIZE.varname, "0");
  for (int c : counts) {
    for (int s : sizes) {
      final FileSystem fs = generateMockFiles(c, s);
      for (int n : numSplits) {
        final OrcInputFormat.Context context = new OrcInputFormat.Context(conf, n);
        OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(
            context, fs, new MockPath(fs, "mock:/a/b"), false, null);
        List<SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen);
        assertEquals(1, splitStrategies.size());
        final SplitStrategy splitStrategy = splitStrategies.get(0);
        assertTrue(
            String.format("Split strategy for %d files x %d size for %d splits", c, s, n),
            splitStrategy.getClass().getSimpleName().equals(strategyResults[k++]));
      }
    }
  }
}
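The second pass reruns the identical matrix with HIVE_ORC_CACHE_STRIPE_DETAILS_MEMORY_SIZE set to 0 (the stripe-details footer cache given no memory) and resets k against the same strategyResults, i.e. disabling the cache is expected not to change strategy selection. Since the two loop nests are identical, a helper along these lines could express the check once; assertStrategyMatrix is a sketch with illustrative naming, not a method from the Hive source:

// Hypothetical helper: run the counts x sizes x numSplits matrix and compare
// the chosen strategy's simple class name against the expectation table.
private void assertStrategyMatrix(int[] counts, int[] sizes, int[] numSplits,
    String[] expected) throws Exception {
  int k = 0;
  for (int c : counts) {
    for (int s : sizes) {
      final FileSystem fs = generateMockFiles(c, s);
      for (int n : numSplits) {
        OrcInputFormat.Context context = new OrcInputFormat.Context(conf, n);
        OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(
            context, fs, new MockPath(fs, "mock:/a/b"), false, null);
        List<SplitStrategy<?>> strategies = createSplitStrategies(context, gen);
        assertEquals(1, strategies.size());
        assertEquals(
            String.format("Split strategy for %d files x %d size for %d splits", c, s, n),
            expected[k++], strategies.get(0).getClass().getSimpleName());
      }
    }
  }
}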
Use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context in project hive by apache.
From the class TestInputOutputFormat, method testDoAs:
@Test
public void testDoAs() throws Exception {
  conf.setInt(ConfVars.HIVE_ORC_COMPUTE_SPLITS_NUM_THREADS.varname, 1);
  conf.set(ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "ETL");
  conf.setBoolean(ConfVars.HIVE_IN_TEST.varname, true);
  conf.setClass("fs.mock.impl", MockFileSystem.class, FileSystem.class);
  String badUser = UserGroupInformation.getCurrentUser().getShortUserName() + "-foo";
  MockFileSystem.setBlockedUgi(badUser);
  // TODO: could we instead get FS from path here and add normal files for every UGI?
  MockFileSystem.clearGlobalFiles();
  // We need the size above to take effect.
  OrcInputFormat.Context.resetThreadPool();
  try {
    // OrcInputFormat will get a mock fs from FileSystem.get; add global files.
    MockFileSystem.addGlobalFile(new MockFile("mock:/ugi/1/file", 10000,
        createMockOrcFile(197, 300, 600),
        new MockBlock("host1-1", "host1-2", "host1-3")));
    MockFileSystem.addGlobalFile(new MockFile("mock:/ugi/2/file", 10000,
        createMockOrcFile(197, 300, 600),
        new MockBlock("host1-1", "host1-2", "host1-3")));
    FileInputFormat.setInputPaths(conf, "mock:/ugi/1");
    UserGroupInformation ugi = UserGroupInformation.createUserForTesting(badUser, new String[0]);
    assertEquals(0, OrcInputFormat.Context.getCurrentThreadPoolSize());
    try {
      ugi.doAs(new PrivilegedExceptionAction<Void>() {
        @Override
        public Void run() throws Exception {
          OrcInputFormat.generateSplitsInfo(conf, new Context(conf, -1, null));
          return null;
        }
      });
      fail("Didn't throw");
    } catch (Exception ex) {
      Throwable cause = ex;
      boolean found = false;
      while (cause != null) {
        if (cause instanceof MockFileSystem.MockAccessDenied) {
          // Expected.
          found = true;
          break;
        }
        cause = cause.getCause();
      }
      if (!found) {
        // Unexpected.
        throw ex;
      }
    }
    assertEquals(1, OrcInputFormat.Context.getCurrentThreadPoolSize());
    FileInputFormat.setInputPaths(conf, "mock:/ugi/2");
    List<OrcSplit> splits = OrcInputFormat.generateSplitsInfo(conf, new Context(conf, -1, null));
    assertEquals(1, splits.size());
  } finally {
    MockFileSystem.clearGlobalFiles();
  }
}
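The two getCurrentThreadPoolSize assertions bracket the lifecycle of the split-computation pool: it is empty after resetThreadPool, is created lazily on the first generateSplitsInfo call with the size configured above (HIVE_ORC_COMPUTE_SPLITS_NUM_THREADS = 1), and survives the blocked user's failed attempt, so the follow-up call for mock:/ugi/2, running as the unblocked current user, reuses the same single-thread pool and succeeds with one split.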