Use of org.apache.hadoop.fs.ContentSummary in project hive by apache: class IndexWhereProcessor, method rewriteForIndexes.
/**
 * Get a list of Tasks to activate use of indexes.
 * Generate the tasks for the index query (where we store the results of
 * querying the index in a tmp file) inside the IndexHandler.
 * @param predicate predicate of the query to rewrite
 * @param indexes indexes to use for the rewrite
 * @param pctx parse context
 * @param task original task before the rewrite
 * @param queryContext stores return values
 */
private void rewriteForIndexes(ExprNodeDesc predicate, List<Index> indexes, ParseContext pctx,
    Task<MapredWork> task, HiveIndexQueryContext queryContext) throws SemanticException {
  HiveIndexHandler indexHandler;
  // All indexes in the list are of the same type, and therefore can use the
  // same handler to generate the index query tasks.
  Index index = indexes.get(0);
  try {
    indexHandler = HiveUtils.getIndexHandler(pctx.getConf(), index.getIndexHandlerClass());
  } catch (HiveException e) {
    LOG.error("Exception while loading IndexHandler: " + index.getIndexHandlerClass(), e);
    throw new SemanticException("Failed to load indexHandler: " + index.getIndexHandlerClass(), e);
  }
  // Check the input size; skip the rewrite if the query is too large to benefit.
  try {
    ContentSummary inputSummary = Utilities.getInputSummary(pctx.getContext(), task.getWork().getMapWork(), null);
    long inputSize = inputSummary.getLength();
    if (!indexHandler.checkQuerySize(inputSize, pctx.getConf())) {
      queryContext.setQueryTasks(null);
      return;
    }
  } catch (IOException e) {
    throw new SemanticException("Failed to get task size", e);
  }
  // Use the IndexHandler to generate the index query.
  indexHandler.generateIndexQuery(indexes, predicate, pctx, queryContext);
  return;
}
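The rewrite above is gated on the total input size reported by ContentSummary.getLength(). As a minimal sketch of the same idea outside Hive's planner, the class below walks a path with FileSystem.getContentSummary and compares the reported length against a threshold; the class name, the MAX_INPUT_BYTES constant, and the fixed cutoff are illustrative assumptions, since the real limit comes from the index handler's configuration via checkQuerySize.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class InputSizeGate {

  // Hypothetical threshold; in Hive the limit is read from configuration,
  // not from a hard-coded constant.
  private static final long MAX_INPUT_BYTES = 10L * 1024 * 1024 * 1024;

  /**
   * Returns true if the total length under inputPath is small enough to make
   * an index-based rewrite worthwhile, mirroring the checkQuerySize gate above.
   */
  public static boolean isSmallEnough(Configuration conf, Path inputPath) throws IOException {
    FileSystem fs = inputPath.getFileSystem(conf);
    // getContentSummary walks the tree once and reports length, file count
    // and directory count in a single ContentSummary object.
    ContentSummary summary = fs.getContentSummary(inputPath);
    return summary.getLength() <= MAX_INPUT_BYTES;
  }
}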
Use of org.apache.hadoop.fs.ContentSummary in project hive by apache: class AbstractJoinTaskDispatcher, method getTotalKnownInputSize.
public long getTotalKnownInputSize(Context context, MapWork currWork,
    Map<Path, ArrayList<String>> pathToAliases, HashMap<String, Long> aliasToSize)
    throws SemanticException {
  try {
    // Go over all the input paths and calculate a known total size, plus a
    // known size per input alias. The return value is discarded: the call is
    // made for its side effect of populating the context's cached per-path
    // content summaries, which are read back below via context.getCS(path).
    Utilities.getInputSummary(context, currWork, null).getLength();
    // Build the alias-to-size mapping. If one table is chosen as the big
    // table, this tells us the total size of the remaining tables, which
    // will become the small tables of the map join.
    long aliasTotalKnownInputSize = 0L;
    for (Map.Entry<Path, ArrayList<String>> entry : pathToAliases.entrySet()) {
      Path path = entry.getKey();
      List<String> aliasList = entry.getValue();
      ContentSummary cs = context.getCS(path);
      if (cs != null) {
        long size = cs.getLength();
        for (String alias : aliasList) {
          aliasTotalKnownInputSize += size;
          Long es = aliasToSize.get(alias);
          if (es == null) {
            es = 0L;
          }
          es += size;
          aliasToSize.put(alias, es);
        }
      }
    }
    return aliasTotalKnownInputSize;
  } catch (Exception e) {
    e.printStackTrace();
    throw new SemanticException("Generate Map Join Task Error: " + e.getMessage());
  }
}
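The alias-to-size map built above feeds the choice of the big table for a map join. The helper below is a simplified, hypothetical model of that decision, under the assumption that an alias qualifies as the big table only when the combined size of the remaining aliases fits under a small-table threshold; it is a sketch of the idea, not the dispatcher's actual selection code.

import java.util.Map;

public class BigTableChooser {

  /**
   * Picks the alias whose known input size is largest, provided the sum of the
   * remaining (small) tables fits under smallTableThreshold. Returns null when
   * no alias qualifies.
   */
  public static String chooseBigTable(Map<String, Long> aliasToSize,
                                      long totalKnownInputSize,
                                      long smallTableThreshold) {
    String bestAlias = null;
    long bestSize = -1L;
    for (Map.Entry<String, Long> e : aliasToSize.entrySet()) {
      long size = e.getValue();
      // Size of every other alias if this one were chosen as the big table.
      long remaining = totalKnownInputSize - size;
      if (size > bestSize && remaining <= smallTableThreshold) {
        bestAlias = e.getKey();
        bestSize = size;
      }
    }
    return bestAlias;
  }
}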
Use of org.apache.hadoop.fs.ContentSummary in project hive by apache: class TestUtilities, method testGetInputSummaryWithContentSummaryInputFormat.
@Test
public void testGetInputSummaryWithContentSummaryInputFormat() throws IOException {
  final int NUM_PARTITIONS = 5;
  final int BYTES_PER_FILE = 10;
  JobConf jobConf = new JobConf();
  Properties properties = new Properties();
  jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2);
  ContentSummaryInputFormatTestClass.setContentSummary(
      new ContentSummary.Builder().length(BYTES_PER_FILE).fileCount(2).directoryCount(1).build());
  /* Write twice as many bytes to the files as the canned summary reports, to verify that
     the reported size comes from ContentSummaryInputFormat rather than from the filesystem. */
  ContentSummary summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS,
      BYTES_PER_FILE * 2, ContentSummaryInputFormatTestClass.class);
  assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
  assertEquals(NUM_PARTITIONS * 2, summary.getFileCount());
  assertEquals(NUM_PARTITIONS, summary.getDirectoryCount());
}
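The test relies on ContentSummaryInputFormatTestClass returning a canned ContentSummary instead of consulting the filesystem. A stripped-down sketch of that idea follows; it only models the static setter and the canned summary built with ContentSummary.Builder, and deliberately omits the input-format wiring, so the class and method names here are illustrative rather than Hive's actual test class.

import org.apache.hadoop.fs.ContentSummary;

/**
 * Simplified model of a test double that always reports a canned
 * ContentSummary, regardless of what is actually on disk.
 */
public class CannedSummarySource {

  private static volatile ContentSummary summary;

  public static void setContentSummary(ContentSummary cs) {
    summary = cs;
  }

  public ContentSummary getContentSummary() {
    return summary;
  }

  public static void main(String[] args) {
    // One partition's worth of canned numbers, matching the Builder call in the test above.
    setContentSummary(new ContentSummary.Builder()
        .length(10)          // bytes reported per partition
        .fileCount(2)        // files reported per partition
        .directoryCount(1)   // directories reported per partition
        .build());
    System.out.println(new CannedSummarySource().getContentSummary().getLength()); // prints 10
  }
}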
Use of org.apache.hadoop.fs.ContentSummary in project hive by apache: class TestUtilities, method testGetInputSummaryWithMultipleThreads.
@Test
public void testGetInputSummaryWithMultipleThreads() throws IOException {
  final int NUM_PARTITIONS = 5;
  final int BYTES_PER_FILE = 5;
  JobConf jobConf = new JobConf();
  Properties properties = new Properties();
  jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 2);
  ContentSummary summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS,
      BYTES_PER_FILE, HiveInputFormat.class);
  assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
  assertEquals(NUM_PARTITIONS, summary.getFileCount());
  assertEquals(NUM_PARTITIONS, summary.getDirectoryCount());
  // Test the deprecated mapred.dfsclient.parallelism.max setting
  jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0);
  jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2);
  summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS,
      BYTES_PER_FILE, HiveInputFormat.class);
  assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
  assertEquals(NUM_PARTITIONS, summary.getFileCount());
  assertEquals(NUM_PARTITIONS, summary.getDirectoryCount());
}
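The two knobs exercised here bound how many threads Utilities.getInputSummary may use while listing input paths. The sketch below is a simplified, hypothetical model of that pattern: it sums ContentSummary lengths for a list of paths using a fixed-size executor. It is not the actual Utilities implementation, and the class and method names are assumptions.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ParallelSummary {

  /** Sums the lengths of all input paths, listing them with at most maxThreads workers. */
  public static long totalLength(Configuration conf, List<Path> paths, int maxThreads)
      throws IOException, InterruptedException, ExecutionException {
    ExecutorService pool = Executors.newFixedThreadPool(Math.max(1, maxThreads));
    try {
      List<Callable<Long>> tasks = new ArrayList<>();
      for (Path p : paths) {
        tasks.add(() -> {
          // Each task fetches one path's ContentSummary and reports its length.
          FileSystem fs = p.getFileSystem(conf);
          ContentSummary cs = fs.getContentSummary(p);
          return cs.getLength();
        });
      }
      long total = 0L;
      for (Future<Long> f : pool.invokeAll(tasks)) {
        total += f.get();
      }
      return total;
    } finally {
      pool.shutdown();
    }
  }
}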
Use of org.apache.hadoop.fs.ContentSummary in project hadoop by apache: class TestReplicationPolicy, method testConvertLastBlockToUnderConstructionDoesNotCauseSkippedReplication.
@Test(timeout = 60000)
public void testConvertLastBlockToUnderConstructionDoesNotCauseSkippedReplication()
    throws IOException {
  Namesystem mockNS = mock(Namesystem.class);
  when(mockNS.hasWriteLock()).thenReturn(true);
  BlockManager bm = new BlockManager(mockNS, false, new HdfsConfiguration());
  LowRedundancyBlocks lowRedundancyBlocks = bm.neededReconstruction;
  long blkID1 = ThreadLocalRandom.current().nextLong();
  if (blkID1 < 0) {
    blkID1 *= -1;
  }
  long blkID2 = ThreadLocalRandom.current().nextLong();
  if (blkID2 < 0) {
    blkID2 *= -1;
  }
  BlockInfo block1 = genBlockInfo(blkID1);
  BlockInfo block2 = genBlockInfo(blkID2);
  // Adding QUEUE_LOW_REDUNDANCY block
  lowRedundancyBlocks.add(block1, 0, 0, 1, 1);
  // Adding QUEUE_LOW_REDUNDANCY block
  lowRedundancyBlocks.add(block2, 0, 0, 1, 1);
  List<List<BlockInfo>> chosenBlocks;
  // Choose 1 block from lowRedundancyBlocks. Then it should pick 1 block
  // from QUEUE_VERY_LOW_REDUNDANCY.
  chosenBlocks = lowRedundancyBlocks.chooseLowRedundancyBlocks(1);
  assertTheChosenBlocks(chosenBlocks, 1, 0, 0, 0, 0);
  final BlockInfoContiguous info = new BlockInfoContiguous(block1, (short) 1);
  final BlockCollection mbc = mock(BlockCollection.class);
  when(mbc.getId()).thenReturn(1000L);
  when(mbc.getLastBlock()).thenReturn(info);
  when(mbc.getPreferredBlockSize()).thenReturn(block1.getNumBytes() + 1);
  when(mbc.isUnderConstruction()).thenReturn(true);
  ContentSummary cs = mock(ContentSummary.class);
  when(cs.getLength()).thenReturn((long) 1);
  when(mbc.computeContentSummary(bm.getStoragePolicySuite())).thenReturn(cs);
  info.setBlockCollectionId(1000);
  bm.addBlockCollection(info, mbc);
  DatanodeStorageInfo[] storageAry = { new DatanodeStorageInfo(dataNodes[0], new DatanodeStorage("s1")) };
  info.convertToBlockUnderConstruction(BlockUCState.UNDER_CONSTRUCTION, storageAry);
  DatanodeStorageInfo storage = mock(DatanodeStorageInfo.class);
  DatanodeDescriptor dn = mock(DatanodeDescriptor.class);
  when(dn.isDecommissioned()).thenReturn(true);
  when(storage.getState()).thenReturn(DatanodeStorage.State.NORMAL);
  when(storage.getDatanodeDescriptor()).thenReturn(dn);
  when(storage.removeBlock(any(BlockInfo.class))).thenReturn(true);
  when(storage.addBlock(any(BlockInfo.class))).thenReturn(DatanodeStorageInfo.AddBlockResult.ADDED);
  info.addStorage(storage, info);
  BlockInfo lastBlk = mbc.getLastBlock();
  when(mbc.getLastBlock()).thenReturn(lastBlk, info);
  bm.convertLastBlockToUnderConstruction(mbc, 0L);
  // Choose 1 block from lowRedundancyBlocks. Then it should pick 1 block
  // from QUEUE_VERY_LOW_REDUNDANCY.
  // This block remains and should not be skipped over.
  chosenBlocks = lowRedundancyBlocks.chooseLowRedundancyBlocks(1);
  assertTheChosenBlocks(chosenBlocks, 1, 0, 0, 0, 0);
}
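Rather than computing a real summary, the test stubs ContentSummary with Mockito so that computeContentSummary returns a fixed length. The minimal helper below isolates that stubbing pattern; the class and method names are illustrative, but the mock and when/thenReturn calls are exactly those used in the test above.

import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import org.apache.hadoop.fs.ContentSummary;

public class MockedSummaryExample {

  /** Returns a ContentSummary stub that reports a fixed length, as the test above does. */
  public static ContentSummary stubSummary(long length) {
    ContentSummary cs = mock(ContentSummary.class);
    // Only getLength() is stubbed; other getters fall back to Mockito defaults (0).
    when(cs.getLength()).thenReturn(length);
    return cs;
  }
}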