Example 6 with Context

Use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context in the Apache Hive project.

From the class TestInputOutputFormat, method testProjectedColumnSize.

@Test
public void testProjectedColumnSize() throws Exception {
    long[] stripeSizes = new long[] { 200, 200, 200, 200, 100 };
    MockFileSystem fs = new MockFileSystem(conf,
        new MockFile("mock:/a/file", 500, createMockOrcFile(stripeSizes),
            new MockBlock("host1-1", "host1-2", "host1-3"),
            new MockBlock("host2-1", "host0", "host2-3"),
            new MockBlock("host0", "host3-2", "host3-3"),
            new MockBlock("host4-1", "host4-2", "host4-3"),
            new MockBlock("host5-1", "host5-2", "host5-3")));
    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 300);
    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 200);
    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
    OrcInputFormat.SplitGenerator splitter = new OrcInputFormat.SplitGenerator(
        new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")),
            null, null, true, new ArrayList<AcidInputFormat.DeltaMetaData>(),
            true, null, null),
        null, true, true);
    List<OrcSplit> results = splitter.call();
    OrcSplit result = results.get(0);
    assertEquals(3, results.size());
    assertEquals(3, result.getStart());
    assertEquals(400, result.getLength());
    assertEquals(167468, result.getProjectedColumnsUncompressedSize());
    result = results.get(1);
    assertEquals(403, result.getStart());
    assertEquals(400, result.getLength());
    assertEquals(167468, result.getProjectedColumnsUncompressedSize());
    result = results.get(2);
    assertEquals(803, result.getStart());
    assertEquals(100, result.getLength());
    assertEquals(41867, result.getProjectedColumnsUncompressedSize());
    // test that min = 0, max = 0 generates one split per stripe
    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 0);
    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 0);
    context = new OrcInputFormat.Context(conf);
    splitter = new OrcInputFormat.SplitGenerator(
        new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")),
            null, null, true, new ArrayList<AcidInputFormat.DeltaMetaData>(),
            true, null, null),
        null, true, true);
    results = splitter.call();
    assertEquals(5, results.size());
    for (int i = 0; i < stripeSizes.length; ++i) {
        assertEquals("checking stripe " + i + " size", stripeSizes[i], results.get(i).getLength());
        if (i == stripeSizes.length - 1) {
            assertEquals(41867, results.get(i).getProjectedColumnsUncompressedSize());
        } else {
            assertEquals(83734, results.get(i).getProjectedColumnsUncompressedSize());
        }
    }
    // single split
    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 1000);
    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 100000);
    context = new OrcInputFormat.Context(conf);
    splitter = new OrcInputFormat.SplitGenerator(
        new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")),
            null, null, true, new ArrayList<AcidInputFormat.DeltaMetaData>(),
            true, null, null),
        null, true, true);
    results = splitter.call();
    assertEquals(1, results.size());
    result = results.get(0);
    assertEquals(3, result.getStart());
    assertEquals(900, result.getLength());
    assertEquals(376804, result.getProjectedColumnsUncompressedSize());
}
Also used : Context(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context) ArrayList(java.util.ArrayList) AcidInputFormat(org.apache.hadoop.hive.ql.io.AcidInputFormat) Test(org.junit.Test)
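
The assertions above imply that the projected-column estimate scales with the raw stripe bytes: the splits covering two 200-byte stripes each report 167468, while the single 100-byte stripe reports 41867, a quarter of that. Below is a minimal sketch of how a job would request the same single-column projection the test sets up (the two ColumnProjectionUtils constants are the real keys used above; the wrapper class and method are hypothetical):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;

public class ProjectionConfigSketch {
    // Project only column id 0, as in the test; ORC split-size estimation
    // then counts just that column's uncompressed bytes.
    public static void projectFirstColumn(Configuration conf) {
        conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
        conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    }
}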

Example 7 with Context

Use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context in the Apache Hive project.

From the class TestInputOutputFormat, method testAddSplit.

@Test
public void testAddSplit() throws Exception {
    // create a file with 5 blocks spread around the cluster
    MockFileSystem fs = new MockFileSystem(conf,
        new MockFile("mock:/a/file", 500,
            createMockOrcFile(197, 300, 600, 200, 200, 100, 100, 100, 100, 100),
            new MockBlock("host1-1", "host1-2", "host1-3"),
            new MockBlock("host2-1", "host0", "host2-3"),
            new MockBlock("host0", "host3-2", "host3-3"),
            new MockBlock("host4-1", "host4-2", "host4-3"),
            new MockBlock("host5-1", "host5-2", "host5-3")));
    OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
    OrcInputFormat.SplitGenerator splitter = new OrcInputFormat.SplitGenerator(
        new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")),
            null, null, true, new ArrayList<AcidInputFormat.DeltaMetaData>(),
            true, null, null),
        null, true, true);
    OrcSplit result = splitter.createSplit(0, 200, null);
    assertEquals(0, result.getStart());
    assertEquals(200, result.getLength());
    assertEquals("mock:/a/file", result.getPath().toString());
    String[] locs = result.getLocations();
    assertEquals(3, locs.length);
    assertEquals("host1-1", locs[0]);
    assertEquals("host1-2", locs[1]);
    assertEquals("host1-3", locs[2]);
    result = splitter.createSplit(500, 600, null);
    locs = result.getLocations();
    assertEquals(3, locs.length);
    assertEquals("host2-1", locs[0]);
    assertEquals("host0", locs[1]);
    assertEquals("host2-3", locs[2]);
    result = splitter.createSplit(0, 2500, null);
    locs = result.getLocations();
    assertEquals(1, locs.length);
    assertEquals("host0", locs[0]);
}
Also used : Context(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context) ArrayList(java.util.ArrayList) AcidInputFormat(org.apache.hadoop.hive.ql.io.AcidInputFormat) Test(org.junit.Test)
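
The three createSplit calls tell a consistent locality story: a split contained in a single block inherits that block's host list, while the 0..2500 split spanning the whole file reports only "host0", the one host that appears in two of the five mock blocks. A hedged illustration of that majority-bytes idea follows (not Hive's actual implementation; the class and method names are made up):

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;

class HostAffinitySketch {
    // Given how many bytes of the split each host serves, keep only the
    // host(s) covering the most data, matching the single "host0" result.
    static List<String> dominantHosts(Map<String, Long> bytesPerHost) {
        long max = Collections.max(bytesPerHost.values());
        List<String> hosts = new ArrayList<>();
        for (Map.Entry<String, Long> e : bytesPerHost.entrySet()) {
            if (e.getValue() == max) {
                hosts.add(e.getKey());
            }
        }
        return hosts;
    }
}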

Example 8 with Context

Use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context in the Apache Hive project.

From the class TestInputOutputFormat, method testACIDSplitStrategy.

@Test
public void testACIDSplitStrategy() throws Exception {
    conf.set("bucket_count", "2");
    conf.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
    OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
    MockFileSystem fs = new MockFileSystem(conf,
        new MockFile("mock:/a/delta_000_001/bucket_000000", 1000, new byte[1], new MockBlock("host1")),
        new MockFile("mock:/a/delta_000_001/bucket_000001", 1000, new byte[1], new MockBlock("host1")),
        new MockFile("mock:/a/delta_001_002/bucket_000000", 1000, new byte[1], new MockBlock("host1")),
        new MockFile("mock:/a/delta_001_002/bucket_000001", 1000, new byte[1], new MockBlock("host1")));
    OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a"), false, null);
    List<OrcInputFormat.SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen);
    assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.ACIDSplitStrategy);
    List<OrcSplit> splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
    ColumnarSplitSizeEstimator splitSizeEstimator = new ColumnarSplitSizeEstimator();
    for (OrcSplit split : splits) {
        assertEquals(1, splitSizeEstimator.getEstimatedSize(split));
    }
    assertEquals(4, splits.size());
}
Also used : Context(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context) SplitStrategy(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.SplitStrategy) ColumnarSplitSizeEstimator(org.apache.hadoop.hive.ql.exec.tez.ColumnarSplitSizeEstimator) Test(org.junit.Test)
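
The mock paths follow the ACID delta naming convention, delta_<minTxn>_<maxTxn>/bucket_<NNNNNN>, and ACIDSplitStrategy emits one split per bucket file here, which is why four files yield four splits. A hypothetical helper (not part of Hive) showing how a transaction range can be read back out of such a directory name:

final class DeltaNameSketch {
    // "delta_000_001" -> {0, 1}; assumes the delta_<min>_<max> layout
    // used by the mock files above.
    static long[] parseTxnRange(String dirName) {
        String[] parts = dirName.split("_");
        return new long[] { Long.parseLong(parts[1]), Long.parseLong(parts[2]) };
    }
}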

Example 9 with Context

Use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context in the Apache Hive project.

From the class TestInputOutputFormat, method testSplitStrategySelection.

@Test
public void testSplitStrategySelection() throws Exception {
    conf.set("mapreduce.input.fileinputformat.split.maxsize", "500");
    conf.set(HiveConf.ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_MEMORY_SIZE.varname, "10Mb");
    final int[] counts = { 1, 10, 100, 256 };
    final int[] sizes = { 100, 1000 };
    final int[] numSplits = { 1, 9, 10, 11, 99, 111 };
    final String[] strategyResults = new String[] { "ETLSplitStrategy", /* 1 files x 100 size for 1 splits */
    "ETLSplitStrategy", /* 1 files x 100 size for 9 splits */
    "ETLSplitStrategy", /* 1 files x 100 size for 10 splits */
    "ETLSplitStrategy", /* 1 files x 100 size for 11 splits */
    "ETLSplitStrategy", /* 1 files x 100 size for 99 splits */
    "ETLSplitStrategy", /* 1 files x 100 size for 111 splits */
    "ETLSplitStrategy", /* 1 files x 1000 size for 1 splits */
    "ETLSplitStrategy", /* 1 files x 1000 size for 9 splits */
    "ETLSplitStrategy", /* 1 files x 1000 size for 10 splits */
    "ETLSplitStrategy", /* 1 files x 1000 size for 11 splits */
    "ETLSplitStrategy", /* 1 files x 1000 size for 99 splits */
    "ETLSplitStrategy", /* 1 files x 1000 size for 111 splits */
    "BISplitStrategy", /* 10 files x 100 size for 1 splits */
    "BISplitStrategy", /* 10 files x 100 size for 9 splits */
    "ETLSplitStrategy", /* 10 files x 100 size for 10 splits */
    "ETLSplitStrategy", /* 10 files x 100 size for 11 splits */
    "ETLSplitStrategy", /* 10 files x 100 size for 99 splits */
    "ETLSplitStrategy", /* 10 files x 100 size for 111 splits */
    "ETLSplitStrategy", /* 10 files x 1000 size for 1 splits */
    "ETLSplitStrategy", /* 10 files x 1000 size for 9 splits */
    "ETLSplitStrategy", /* 10 files x 1000 size for 10 splits */
    "ETLSplitStrategy", /* 10 files x 1000 size for 11 splits */
    "ETLSplitStrategy", /* 10 files x 1000 size for 99 splits */
    "ETLSplitStrategy", /* 10 files x 1000 size for 111 splits */
    "BISplitStrategy", /* 100 files x 100 size for 1 splits */
    "BISplitStrategy", /* 100 files x 100 size for 9 splits */
    "BISplitStrategy", /* 100 files x 100 size for 10 splits */
    "BISplitStrategy", /* 100 files x 100 size for 11 splits */
    "BISplitStrategy", /* 100 files x 100 size for 99 splits */
    "ETLSplitStrategy", /* 100 files x 100 size for 111 splits */
    "ETLSplitStrategy", /* 100 files x 1000 size for 1 splits */
    "ETLSplitStrategy", /* 100 files x 1000 size for 9 splits */
    "ETLSplitStrategy", /* 100 files x 1000 size for 10 splits */
    "ETLSplitStrategy", /* 100 files x 1000 size for 11 splits */
    "ETLSplitStrategy", /* 100 files x 1000 size for 99 splits */
    "ETLSplitStrategy", /* 100 files x 1000 size for 111 splits */
    "BISplitStrategy", /* 256 files x 100 size for 1 splits */
    "BISplitStrategy", /* 256 files x 100 size for 9 splits */
    "BISplitStrategy", /* 256 files x 100 size for 10 splits */
    "BISplitStrategy", /* 256 files x 100 size for 11 splits */
    "BISplitStrategy", /* 256 files x 100 size for 99 splits */
    "BISplitStrategy", /* 256 files x 100 size for 111 splits */
    "ETLSplitStrategy", /* 256 files x 1000 size for 1 splits */
    "ETLSplitStrategy", /* 256 files x 1000 size for 9 splits */
    "ETLSplitStrategy", /* 256 files x 1000 size for 10 splits */
    "ETLSplitStrategy", /* 256 files x 1000 size for 11 splits */
    "ETLSplitStrategy", /* 256 files x 1000 size for 99 splits */
    "ETLSplitStrategy" /* 256 files x 1000 size for 111 splits */
    };
    int k = 0;
    for (int c : counts) {
        for (int s : sizes) {
            final FileSystem fs = generateMockFiles(c, s);
            for (int n : numSplits) {
                final OrcInputFormat.Context context = new OrcInputFormat.Context(conf, n);
                OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a/b"), false, null);
                List<SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen);
                assertEquals(1, splitStrategies.size());
                final SplitStrategy splitStrategy = splitStrategies.get(0);
                assertTrue(String.format("Split strategy for %d files x %d size for %d splits", c, s, n), splitStrategy.getClass().getSimpleName().equals(strategyResults[k++]));
            }
        }
    }
    k = 0;
    conf.set(ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_MEMORY_SIZE.varname, "0");
    for (int c : counts) {
        for (int s : sizes) {
            final FileSystem fs = generateMockFiles(c, s);
            for (int n : numSplits) {
                final OrcInputFormat.Context context = new OrcInputFormat.Context(conf, n);
                OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a/b"), false, null);
                List<SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen);
                assertEquals(1, splitStrategies.size());
                final SplitStrategy splitStrategy = splitStrategies.get(0);
                assertTrue(String.format("Split strategy for %d files x %d size for %d splits", c, s, n), splitStrategy.getClass().getSimpleName().equals(strategyResults[k++]));
            }
        }
    }
}
Also used : Context(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context) SplitStrategy(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.SplitStrategy) Test(org.junit.Test)
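
Read as a truth table, the expected results pick BISplitStrategy exactly when the files are small (the 100-byte case) and there are strictly more files than requested splits; every other combination falls to ETLSplitStrategy. A hedged sketch of that inferred rule (the real selection logic in OrcInputFormat weighs more inputs, such as the footer cache configured above, and the threshold parameter here is an assumption):

final class StrategyChoiceSketch {
    // BI: one split per file, no footer reads; cheap for many small files.
    // ETL: reads footers and splits by stripe; wins for larger files or
    // when few files must satisfy many requested splits.
    static String choose(int numFiles, long fileSize, int numSplits, long smallFileThreshold) {
        boolean smallFiles = fileSize <= smallFileThreshold;
        boolean moreFilesThanSplits = numFiles > numSplits;
        return (smallFiles && moreFilesThanSplits) ? "BISplitStrategy" : "ETLSplitStrategy";
    }
}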

Example 10 with Context

Use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context in the Apache Hive project.

From the class TestInputOutputFormat, method testDoAs.

@Test
public void testDoAs() throws Exception {
    conf.setInt(ConfVars.HIVE_ORC_COMPUTE_SPLITS_NUM_THREADS.varname, 1);
    conf.set(ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "ETL");
    conf.setBoolean(ConfVars.HIVE_IN_TEST.varname, true);
    conf.setClass("fs.mock.impl", MockFileSystem.class, FileSystem.class);
    String badUser = UserGroupInformation.getCurrentUser().getShortUserName() + "-foo";
    MockFileSystem.setBlockedUgi(badUser);
    // TODO: could we instead get FS from path here and add normal files for every UGI?
    MockFileSystem.clearGlobalFiles();
    // We need the size above to take effect.
    OrcInputFormat.Context.resetThreadPool();
    try {
        // OrcInputFormat will get a mock fs from FileSystem.get; add global files.
        MockFileSystem.addGlobalFile(new MockFile("mock:/ugi/1/file", 10000, createMockOrcFile(197, 300, 600), new MockBlock("host1-1", "host1-2", "host1-3")));
        MockFileSystem.addGlobalFile(new MockFile("mock:/ugi/2/file", 10000, createMockOrcFile(197, 300, 600), new MockBlock("host1-1", "host1-2", "host1-3")));
        FileInputFormat.setInputPaths(conf, "mock:/ugi/1");
        UserGroupInformation ugi = UserGroupInformation.createUserForTesting(badUser, new String[0]);
        assertEquals(0, OrcInputFormat.Context.getCurrentThreadPoolSize());
        try {
            ugi.doAs(new PrivilegedExceptionAction<Void>() {

                @Override
                public Void run() throws Exception {
                    OrcInputFormat.generateSplitsInfo(conf, new Context(conf, -1, null));
                    return null;
                }
            });
            fail("Didn't throw");
        } catch (Exception ex) {
            Throwable cause = ex;
            boolean found = false;
            while (cause != null) {
                if (cause instanceof MockFileSystem.MockAccessDenied) {
                    // Expected.
                    found = true;
                    break;
                }
                cause = cause.getCause();
            }
            // Unexpected.
            if (!found)
                throw ex;
        }
        assertEquals(1, OrcInputFormat.Context.getCurrentThreadPoolSize());
        FileInputFormat.setInputPaths(conf, "mock:/ugi/2");
        List<OrcSplit> splits = OrcInputFormat.generateSplitsInfo(conf, new Context(conf, -1, null));
        assertEquals(1, splits.size());
    } finally {
        MockFileSystem.clearGlobalFiles();
    }
}
Also used : Context(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context) IOException(java.io.IOException) URISyntaxException(java.net.URISyntaxException) FileNotFoundException(java.io.FileNotFoundException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation) Test(org.junit.Test)
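
The pattern under test: split computation runs inside ugi.doAs(...), so the FileSystem access performed by the split-generation thread pool happens as that user, and the pool is created lazily on first use (the pool-size check goes from 0 to 1 across the failed doAs call). A minimal sketch of the doAs wrapper, assuming nothing beyond the standard UserGroupInformation API:

import java.security.PrivilegedExceptionAction;
import org.apache.hadoop.security.UserGroupInformation;

class DoAsSketch {
    // Run a split-producing action as the given user; Hadoop security
    // context inside the action resolves to this UGI.
    static <T> T runAs(UserGroupInformation ugi, PrivilegedExceptionAction<T> action) throws Exception {
        return ugi.doAs(action);
    }
}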

Aggregations

Context (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context) 10
Test (org.junit.Test) 10
SplitStrategy (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.SplitStrategy) 6
ArrayList (java.util.ArrayList) 3
AcidInputFormat (org.apache.hadoop.hive.ql.io.AcidInputFormat) 3
FileNotFoundException (java.io.FileNotFoundException) 1
IOException (java.io.IOException) 1
URISyntaxException (java.net.URISyntaxException) 1
ColumnarSplitSizeEstimator (org.apache.hadoop.hive.ql.exec.tez.ColumnarSplitSizeEstimator) 1
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException) 1
UserGroupInformation (org.apache.hadoop.security.UserGroupInformation) 1