Use of org.apache.hadoop.fs.ContentSummary in project hive by apache: class IndexWhereProcessor, method rewriteForIndexes.
/**
 * Get a list of Tasks to activate use of indexes.
 * Generate the tasks for the index query (where we store the results of
 * querying the index in a tmp file) inside the IndexHandler.
 * @param predicate predicate of the query to rewrite
 * @param indexes indexes to use for the rewrite
 * @param pctx parse context
 * @param task original task before the rewrite
 * @param queryContext stores return values
 */
private void rewriteForIndexes(ExprNodeDesc predicate, List<Index> indexes, ParseContext pctx,
    Task<MapredWork> task, HiveIndexQueryContext queryContext) throws SemanticException {
  HiveIndexHandler indexHandler;
  // All indexes in the list are of the same type, and therefore can use the
  // same handler to generate the index query tasks.
  Index index = indexes.get(0);
  try {
    indexHandler = HiveUtils.getIndexHandler(pctx.getConf(), index.getIndexHandlerClass());
  } catch (HiveException e) {
    LOG.error("Exception while loading IndexHandler: " + index.getIndexHandlerClass(), e);
    throw new SemanticException("Failed to load indexHandler: " + index.getIndexHandlerClass(), e);
  }
  // Check the input size; skip the rewrite if the query is too large to benefit.
  try {
    ContentSummary inputSummary = Utilities.getInputSummary(pctx.getContext(), task.getWork().getMapWork(), null);
    long inputSize = inputSummary.getLength();
    if (!indexHandler.checkQuerySize(inputSize, pctx.getConf())) {
      queryContext.setQueryTasks(null);
      return;
    }
  } catch (IOException e) {
    throw new SemanticException("Failed to get task size", e);
  }
  // Use the IndexHandler to generate the index query.
  indexHandler.generateIndexQuery(indexes, predicate, pctx, queryContext);
  return;
}
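The rewrite above is gated on the total input size reported by ContentSummary.getLength(). As a minimal sketch of the same idea outside Hive's planner, the class below walks a path with FileSystem.getContentSummary and compares the reported length against a threshold; the class name, the MAX_INPUT_BYTES constant, and the fixed cutoff are illustrative assumptions, since the real limit comes from the index handler's configuration via checkQuerySize.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class InputSizeGate {

  // Hypothetical threshold; in Hive the limit is read from configuration,
  // not from a hard-coded constant.
  private static final long MAX_INPUT_BYTES = 10L * 1024 * 1024 * 1024;

  /**
   * Returns true if the total length under inputPath is small enough to make
   * an index-based rewrite worthwhile, mirroring the checkQuerySize gate above.
   */
  public static boolean isSmallEnough(Configuration conf, Path inputPath) throws IOException {
    FileSystem fs = inputPath.getFileSystem(conf);
    // getContentSummary walks the tree once and reports length, file count
    // and directory count in a single ContentSummary object.
    ContentSummary summary = fs.getContentSummary(inputPath);
    return summary.getLength() <= MAX_INPUT_BYTES;
  }
}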
Use of org.apache.hadoop.fs.ContentSummary in project hive by apache: class AbstractJoinTaskDispatcher, method getTotalKnownInputSize.
public long getTotalKnownInputSize(Context context, MapWork currWork,
    Map<Path, ArrayList<String>> pathToAliases, HashMap<String, Long> aliasToSize)
    throws SemanticException {
  try {
    // Go over all the input paths and calculate a known total size, plus a
    // known size per input alias. The return value is discarded: the call is
    // made for its side effect of populating the context's cached per-path
    // content summaries, which are read back below via context.getCS(path).
    Utilities.getInputSummary(context, currWork, null).getLength();
    // Build the alias-to-size mapping. If one table is chosen as the big
    // table, this tells us the total size of the remaining tables, which
    // will become the small tables of the map join.
    long aliasTotalKnownInputSize = 0L;
    for (Map.Entry<Path, ArrayList<String>> entry : pathToAliases.entrySet()) {
      Path path = entry.getKey();
      List<String> aliasList = entry.getValue();
      ContentSummary cs = context.getCS(path);
      if (cs != null) {
        long size = cs.getLength();
        for (String alias : aliasList) {
          aliasTotalKnownInputSize += size;
          Long es = aliasToSize.get(alias);
          if (es == null) {
            es = 0L;
          }
          es += size;
          aliasToSize.put(alias, es);
        }
      }
    }
    return aliasTotalKnownInputSize;
  } catch (Exception e) {
    e.printStackTrace();
    throw new SemanticException("Generate Map Join Task Error: " + e.getMessage());
  }
}
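The alias-to-size map built above feeds the choice of the big table for a map join. The helper below is a simplified, hypothetical model of that decision, under the assumption that an alias qualifies as the big table only when the combined size of the remaining aliases fits under a small-table threshold; it is a sketch of the idea, not the dispatcher's actual selection code.

import java.util.Map;

public class BigTableChooser {

  /**
   * Picks the alias whose known input size is largest, provided the sum of the
   * remaining (small) tables fits under smallTableThreshold. Returns null when
   * no alias qualifies.
   */
  public static String chooseBigTable(Map<String, Long> aliasToSize,
                                      long totalKnownInputSize,
                                      long smallTableThreshold) {
    String bestAlias = null;
    long bestSize = -1L;
    for (Map.Entry<String, Long> e : aliasToSize.entrySet()) {
      long size = e.getValue();
      // Size of every other alias if this one were chosen as the big table.
      long remaining = totalKnownInputSize - size;
      if (size > bestSize && remaining <= smallTableThreshold) {
        bestAlias = e.getKey();
        bestSize = size;
      }
    }
    return bestAlias;
  }
}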
Use of org.apache.hadoop.fs.ContentSummary in project hive by apache: class TestUtilities, method testGetInputSummaryWithContentSummaryInputFormat.
@Test
public void testGetInputSummaryWithContentSummaryInputFormat() throws IOException {
  final int NUM_PARTITIONS = 5;
  final int BYTES_PER_FILE = 10;
  JobConf jobConf = new JobConf();
  Properties properties = new Properties();
  jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2);
  ContentSummaryInputFormatTestClass.setContentSummary(
      new ContentSummary.Builder().length(BYTES_PER_FILE).fileCount(2).directoryCount(1).build());
  /* Write twice as many bytes to the files as the canned summary reports, to verify that
     the reported size comes from ContentSummaryInputFormat rather than from the filesystem. */
  ContentSummary summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS,
      BYTES_PER_FILE * 2, ContentSummaryInputFormatTestClass.class);
  assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
  assertEquals(NUM_PARTITIONS * 2, summary.getFileCount());
  assertEquals(NUM_PARTITIONS, summary.getDirectoryCount());
}
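The test relies on ContentSummaryInputFormatTestClass returning a canned ContentSummary instead of consulting the filesystem. A stripped-down sketch of that idea follows; it only models the static setter and the canned summary built with ContentSummary.Builder, and deliberately omits the input-format wiring, so the class and method names here are illustrative rather than Hive's actual test class.

import org.apache.hadoop.fs.ContentSummary;

/**
 * Simplified model of a test double that always reports a canned
 * ContentSummary, regardless of what is actually on disk.
 */
public class CannedSummarySource {

  private static volatile ContentSummary summary;

  public static void setContentSummary(ContentSummary cs) {
    summary = cs;
  }

  public ContentSummary getContentSummary() {
    return summary;
  }

  public static void main(String[] args) {
    // One partition's worth of canned numbers, matching the Builder call in the test above.
    setContentSummary(new ContentSummary.Builder()
        .length(10)          // bytes reported per partition
        .fileCount(2)        // files reported per partition
        .directoryCount(1)   // directories reported per partition
        .build());
    System.out.println(new CannedSummarySource().getContentSummary().getLength()); // prints 10
  }
}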
Use of org.apache.hadoop.fs.ContentSummary in project hive by apache: class TestUtilities, method testGetInputSummaryWithMultipleThreads.
@Test
public void testGetInputSummaryWithMultipleThreads() throws IOException {
  final int NUM_PARTITIONS = 5;
  final int BYTES_PER_FILE = 5;
  JobConf jobConf = new JobConf();
  Properties properties = new Properties();
  jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 2);
  ContentSummary summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS,
      BYTES_PER_FILE, HiveInputFormat.class);
  assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
  assertEquals(NUM_PARTITIONS, summary.getFileCount());
  assertEquals(NUM_PARTITIONS, summary.getDirectoryCount());
  // Test the deprecated mapred.dfsclient.parallelism.max setting
  jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0);
  jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2);
  summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS,
      BYTES_PER_FILE, HiveInputFormat.class);
  assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
  assertEquals(NUM_PARTITIONS, summary.getFileCount());
  assertEquals(NUM_PARTITIONS, summary.getDirectoryCount());
}
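The two knobs exercised here bound how many threads Utilities.getInputSummary may use while listing input paths. The sketch below is a simplified, hypothetical model of that pattern: it sums ContentSummary lengths for a list of paths using a fixed-size executor. It is not the actual Utilities implementation, and the class and method names are assumptions.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ParallelSummary {

  /** Sums the lengths of all input paths, listing them with at most maxThreads workers. */
  public static long totalLength(Configuration conf, List<Path> paths, int maxThreads)
      throws IOException, InterruptedException, ExecutionException {
    ExecutorService pool = Executors.newFixedThreadPool(Math.max(1, maxThreads));
    try {
      List<Callable<Long>> tasks = new ArrayList<>();
      for (Path p : paths) {
        tasks.add(() -> {
          // Each task fetches one path's ContentSummary and reports its length.
          FileSystem fs = p.getFileSystem(conf);
          ContentSummary cs = fs.getContentSummary(p);
          return cs.getLength();
        });
      }
      long total = 0L;
      for (Future<Long> f : pool.invokeAll(tasks)) {
        total += f.get();
      }
      return total;
    } finally {
      pool.shutdown();
    }
  }
}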
Use of org.apache.hadoop.fs.ContentSummary in project hadoop by apache: class TestReplicationPolicy, method testConvertLastBlockToUnderConstructionDoesNotCauseSkippedReplication.
@Test(timeout = 60000)
public void testConvertLastBlockToUnderConstructionDoesNotCauseSkippedReplication()
    throws IOException {
  Namesystem mockNS = mock(Namesystem.class);
  when(mockNS.hasWriteLock()).thenReturn(true);
  BlockManager bm = new BlockManager(mockNS, false, new HdfsConfiguration());
  LowRedundancyBlocks lowRedundancyBlocks = bm.neededReconstruction;
  long blkID1 = ThreadLocalRandom.current().nextLong();
  if (blkID1 < 0) {
    blkID1 *= -1;
  }
  long blkID2 = ThreadLocalRandom.current().nextLong();
  if (blkID2 < 0) {
    blkID2 *= -1;
  }
  BlockInfo block1 = genBlockInfo(blkID1);
  BlockInfo block2 = genBlockInfo(blkID2);
  // Adding QUEUE_LOW_REDUNDANCY block
  lowRedundancyBlocks.add(block1, 0, 0, 1, 1);
  // Adding QUEUE_LOW_REDUNDANCY block
  lowRedundancyBlocks.add(block2, 0, 0, 1, 1);
  List<List<BlockInfo>> chosenBlocks;
  // Choose 1 block from lowRedundancyBlocks. Then it should pick 1 block
  // from QUEUE_VERY_LOW_REDUNDANCY.
  chosenBlocks = lowRedundancyBlocks.chooseLowRedundancyBlocks(1);
  assertTheChosenBlocks(chosenBlocks, 1, 0, 0, 0, 0);
  final BlockInfoContiguous info = new BlockInfoContiguous(block1, (short) 1);
  final BlockCollection mbc = mock(BlockCollection.class);
  when(mbc.getId()).thenReturn(1000L);
  when(mbc.getLastBlock()).thenReturn(info);
  when(mbc.getPreferredBlockSize()).thenReturn(block1.getNumBytes() + 1);
  when(mbc.isUnderConstruction()).thenReturn(true);
  ContentSummary cs = mock(ContentSummary.class);
  when(cs.getLength()).thenReturn((long) 1);
  when(mbc.computeContentSummary(bm.getStoragePolicySuite())).thenReturn(cs);
  info.setBlockCollectionId(1000);
  bm.addBlockCollection(info, mbc);
  DatanodeStorageInfo[] storageAry = { new DatanodeStorageInfo(dataNodes[0], new DatanodeStorage("s1")) };
  info.convertToBlockUnderConstruction(BlockUCState.UNDER_CONSTRUCTION, storageAry);
  DatanodeStorageInfo storage = mock(DatanodeStorageInfo.class);
  DatanodeDescriptor dn = mock(DatanodeDescriptor.class);
  when(dn.isDecommissioned()).thenReturn(true);
  when(storage.getState()).thenReturn(DatanodeStorage.State.NORMAL);
  when(storage.getDatanodeDescriptor()).thenReturn(dn);
  when(storage.removeBlock(any(BlockInfo.class))).thenReturn(true);
  when(storage.addBlock(any(BlockInfo.class))).thenReturn(DatanodeStorageInfo.AddBlockResult.ADDED);
  info.addStorage(storage, info);
  BlockInfo lastBlk = mbc.getLastBlock();
  when(mbc.getLastBlock()).thenReturn(lastBlk, info);
  bm.convertLastBlockToUnderConstruction(mbc, 0L);
  // Choose 1 block from lowRedundancyBlocks. Then it should pick 1 block
  // from QUEUE_VERY_LOW_REDUNDANCY.
  // This block remains and should not be skipped over.
  chosenBlocks = lowRedundancyBlocks.chooseLowRedundancyBlocks(1);
  assertTheChosenBlocks(chosenBlocks, 1, 0, 0, 0, 0);
}
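Rather than computing a real summary, the test stubs ContentSummary with Mockito so that computeContentSummary returns a fixed length. The minimal helper below isolates that stubbing pattern; the class and method names are illustrative, but the mock and when/thenReturn calls are exactly those used in the test above.

import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import org.apache.hadoop.fs.ContentSummary;

public class MockedSummaryExample {

  /** Returns a ContentSummary stub that reports a fixed length, as the test above does. */
  public static ContentSummary stubSummary(long length) {
    ContentSummary cs = mock(ContentSummary.class);
    // Only getLength() is stubbed; other getters fall back to Mockito defaults (0).
    when(cs.getLength()).thenReturn(length);
    return cs;
  }
}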