Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.
The class TestSymlinkTextInputFormat, method testAccuracy2.
/**
 * Scenario: Empty input directory, i.e. no symlink file.
 *
 * Expected: Should return empty result set without any exception.
 */
public void testAccuracy2() throws IOException {
  fileSystem.mkdirs(symlinkDir);
  FileInputFormat.setInputPaths(job, symlinkDir);
  SymlinkTextInputFormat inputFormat = new SymlinkTextInputFormat();
  ContentSummary cs = inputFormat.getContentSummary(symlinkDir, job);
  assertEquals(0, cs.getLength());
  assertEquals(0, cs.getFileCount());
  assertEquals(0, cs.getDirectoryCount());
  InputSplit[] splits = inputFormat.getSplits(job, 2);
  log.info("Number of splits: " + splits.length);
  // Read all values.
  List<String> received = new ArrayList<String>();
  for (InputSplit split : splits) {
    RecordReader<LongWritable, Text> reader =
        inputFormat.getRecordReader(split, job, reporter);
    LongWritable key = reader.createKey();
    Text value = reader.createValue();
    while (reader.next(key, value)) {
      received.add(value.toString());
    }
    reader.close();
  }
  List<String> expected = new ArrayList<String>();
  assertEquals(expected, received);
}
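
For reference, a ContentSummary can also be obtained directly from a FileSystem instead of through an input format. Below is a minimal standalone sketch; the directory path and plain Configuration are hypothetical and not taken from the test above.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ContentSummaryExample {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    // Hypothetical directory; pass a real path as the first argument instead.
    Path dir = new Path(args.length > 0 ? args[0] : "/tmp/example-input");
    FileSystem fs = dir.getFileSystem(conf);
    ContentSummary cs = fs.getContentSummary(dir);
    // ContentSummary exposes total length, file count and directory count.
    System.out.println("bytes=" + cs.getLength()
        + ", files=" + cs.getFileCount()
        + ", dirs=" + cs.getDirectoryCount());
  }
}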
Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.
The class TestUtilities, method testGetInputSummaryWithContentSummaryInputFormat.
@Test
public void testGetInputSummaryWithContentSummaryInputFormat() throws IOException {
  final int NUM_PARTITIONS = 5;
  final int BYTES_PER_FILE = 10;
  JobConf jobConf = new JobConf();
  Properties properties = new Properties();
  jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2);
  ContentSummaryInputFormatTestClass.setContentSummary(
      new ContentSummary.Builder().length(BYTES_PER_FILE).fileCount(2).directoryCount(1).build());
  // Write more bytes to the files than the canned summary reports, to verify that
  // ContentSummaryInputFormat is actually used and the size is not read from the filesystem.
  ContentSummary summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS,
      BYTES_PER_FILE * 2, ContentSummaryInputFormatTestClass.class);
  assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
  assertEquals(NUM_PARTITIONS * 2, summary.getFileCount());
  assertEquals(NUM_PARTITIONS, summary.getDirectoryCount());
}
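
The test input format class itself is not shown on this page. As a rough, hypothetical sketch of what such a class could look like, the snippet below extends the plain mapred TextInputFormat and implements Hive's ContentSummaryInputFormat (assumed here to live in org.apache.hadoop.hive.ql.io, with the getContentSummary(Path, JobConf) method used elsewhere on this page); the class name and the static field are invented for illustration.

import java.io.IOException;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

// Hypothetical stand-in for a class like ContentSummaryInputFormatTestClass:
// it ignores the files on disk and returns whatever summary the test registered.
public class CannedContentSummaryInputFormat extends TextInputFormat
    implements ContentSummaryInputFormat {

  private static ContentSummary summary;

  public static void setContentSummary(ContentSummary cs) {
    summary = cs;
  }

  @Override
  public ContentSummary getContentSummary(Path p, JobConf job) throws IOException {
    return summary;
  }
}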
Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.
The class TestUtilities, method testGetInputSummaryWithMultipleThreads.
@Test
public void testGetInputSummaryWithMultipleThreads() throws IOException {
  final int NUM_PARTITIONS = 5;
  final int BYTES_PER_FILE = 5;
  JobConf jobConf = new JobConf();
  Properties properties = new Properties();
  jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 2);
  ContentSummary summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS,
      BYTES_PER_FILE, HiveInputFormat.class);
  assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
  assertEquals(NUM_PARTITIONS, summary.getFileCount());
  assertEquals(NUM_PARTITIONS, summary.getDirectoryCount());
  // Test deprecated mapred.dfsclient.parallelism.max
  jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0);
  jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2);
  summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS, BYTES_PER_FILE,
      HiveInputFormat.class);
  assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
  assertEquals(NUM_PARTITIONS, summary.getFileCount());
  assertEquals(NUM_PARTITIONS, summary.getDirectoryCount());
}
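
The fallback this test exercises can be summarized as: prefer hive.exec.input.listing.max.threads, and only when it is unset (0) consult the deprecated mapred.dfsclient.parallelism.max. The helper below is a hypothetical restatement of that behavior, written as if it lived in the same test class (so HiveConf, Utilities and JobConf are already imported); it is not the actual Utilities implementation.

// Hypothetical helper mirroring the configuration fallback asserted by the test.
static int resolveInputListingThreads(JobConf conf) {
  int threads = conf.getInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0);
  if (threads <= 0) {
    // Fall back to the deprecated setting when the Hive option is unset.
    threads = conf.getInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 0);
  }
  return threads;
}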
Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.
The class Utilities, method getInputSummary.
/**
 * Calculate the total size of input files.
 *
 * @param ctx
 *          the hadoop job context
 * @param work
 *          map reduce job plan
 * @param filter
 *          filter to apply to the input paths before calculating size
 * @return the summary of all the input paths.
 * @throws IOException
 */
public static ContentSummary getInputSummary(final Context ctx, MapWork work, PathFilter filter) throws IOException {
  PerfLogger perfLogger = SessionState.getPerfLogger();
  perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.INPUT_SUMMARY);
  long[] summary = { 0, 0, 0 };
  final Set<Path> pathNeedProcess = new HashSet<>();
  // Synchronizing here keeps the number of summary threads from growing out of control.
  synchronized (INPUT_SUMMARY_LOCK) {
    // For each input path, calculate the total size.
    for (Path path : work.getPathToAliases().keySet()) {
      Path p = path;
      if (filter != null && !filter.accept(p)) {
        continue;
      }
      ContentSummary cs = ctx.getCS(path);
      if (cs == null) {
        if (path == null) {
          continue;
        }
        pathNeedProcess.add(path);
      } else {
        summary[0] += cs.getLength();
        summary[1] += cs.getFileCount();
        summary[2] += cs.getDirectoryCount();
      }
    }
    // Process the case when name node call is needed
    final Map<String, ContentSummary> resultMap = new ConcurrentHashMap<String, ContentSummary>();
    ArrayList<Future<?>> results = new ArrayList<Future<?>>();
    final ExecutorService executor;
    int numExecutors = getMaxExecutorsForInputListing(ctx.getConf(), pathNeedProcess.size());
    if (numExecutors > 1) {
      LOG.info("Using " + numExecutors + " threads for getContentSummary");
      executor = Executors.newFixedThreadPool(numExecutors,
          new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Get-Input-Summary-%d").build());
    } else {
      executor = null;
    }
    HiveInterruptCallback interrup = HiveInterruptUtils.add(new HiveInterruptCallback() {
      @Override
      public void interrupt() {
        for (Path path : pathNeedProcess) {
          try {
            path.getFileSystem(ctx.getConf()).close();
          } catch (IOException ignore) {
            LOG.debug("Failed to close filesystem", ignore);
          }
        }
        if (executor != null) {
          executor.shutdownNow();
        }
      }
    });
    try {
      Configuration conf = ctx.getConf();
      JobConf jobConf = new JobConf(conf);
      for (Path path : pathNeedProcess) {
        final Path p = path;
        final String pathStr = path.toString();
        // All threads share the same Configuration and JobConf, on the assumption
        // that they are thread safe as long as only read operations are performed.
        // Hadoop's javadoc does not state this, but the source code clearly shows
        // efforts to make it so, so we believe the assumption holds. Revisit this
        // code if the assumption turns out to be incorrect.
        final Configuration myConf = conf;
        final JobConf myJobConf = jobConf;
        final Map<String, Operator<?>> aliasToWork = work.getAliasToWork();
        final Map<Path, ArrayList<String>> pathToAlias = work.getPathToAliases();
        final PartitionDesc partDesc = work.getPathToPartitionInfo().get(p);
        Runnable r = new Runnable() {
          @Override
          public void run() {
            try {
              Class<? extends InputFormat> inputFormatCls = partDesc.getInputFileFormatClass();
              InputFormat inputFormatObj = HiveInputFormat.getInputFormatFromCache(inputFormatCls, myJobConf);
              if (inputFormatObj instanceof ContentSummaryInputFormat) {
                ContentSummaryInputFormat cs = (ContentSummaryInputFormat) inputFormatObj;
                resultMap.put(pathStr, cs.getContentSummary(p, myJobConf));
                return;
              }
              String metaTableStorage = null;
              if (partDesc.getTableDesc() != null && partDesc.getTableDesc().getProperties() != null) {
                metaTableStorage = partDesc.getTableDesc().getProperties()
                    .getProperty(hive_metastoreConstants.META_TABLE_STORAGE, null);
              }
              if (partDesc.getProperties() != null) {
                metaTableStorage = partDesc.getProperties()
                    .getProperty(hive_metastoreConstants.META_TABLE_STORAGE, metaTableStorage);
              }
              HiveStorageHandler handler = HiveUtils.getStorageHandler(myConf, metaTableStorage);
              if (handler instanceof InputEstimator) {
                long total = 0;
                TableDesc tableDesc = partDesc.getTableDesc();
                InputEstimator estimator = (InputEstimator) handler;
                for (String alias : HiveFileFormatUtils.doGetAliasesFromPath(pathToAlias, p)) {
                  JobConf jobConf = new JobConf(myJobConf);
                  TableScanOperator scanOp = (TableScanOperator) aliasToWork.get(alias);
                  Utilities.setColumnNameList(jobConf, scanOp, true);
                  Utilities.setColumnTypeList(jobConf, scanOp, true);
                  PlanUtils.configureInputJobPropertiesForStorageHandler(tableDesc);
                  Utilities.copyTableJobPropertiesToConf(tableDesc, jobConf);
                  total += estimator.estimate(jobConf, scanOp, -1).getTotalLength();
                }
                resultMap.put(pathStr, new ContentSummary(total, -1, -1));
              } else {
                // TODO: should nullify the summary for non-native tables,
                // so they are not selected as a mapjoin target.
                FileSystem fs = p.getFileSystem(myConf);
                resultMap.put(pathStr, fs.getContentSummary(p));
              }
            } catch (Exception e) {
              // We safely ignore this exception for summary data.
              // We don't update the cache, so it is not polluted for other usages.
              // The worst case is that the IOException is retried on the next
              // getInputSummary() call, which is fine since IOException is not
              // considered a common case.
              LOG.info("Cannot get size of " + pathStr + ". Safely ignored.");
            }
          }
        };
        if (executor == null) {
          r.run();
        } else {
          Future<?> result = executor.submit(r);
          results.add(result);
        }
      }
      if (executor != null) {
        for (Future<?> result : results) {
          boolean executorDone = false;
          do {
            try {
              result.get();
              executorDone = true;
            } catch (InterruptedException e) {
              LOG.info("Interrupted when waiting threads: ", e);
              Thread.currentThread().interrupt();
              break;
            } catch (ExecutionException e) {
              throw new IOException(e);
            }
          } while (!executorDone);
        }
        executor.shutdown();
      }
      HiveInterruptUtils.checkInterrupted();
      for (Map.Entry<String, ContentSummary> entry : resultMap.entrySet()) {
        ContentSummary cs = entry.getValue();
        summary[0] += cs.getLength();
        summary[1] += cs.getFileCount();
        summary[2] += cs.getDirectoryCount();
        ctx.addCS(entry.getKey(), cs);
        LOG.info("Cache Content Summary for " + entry.getKey() + " length: " + cs.getLength()
            + " file count: " + cs.getFileCount() + " directory count: " + cs.getDirectoryCount());
      }
      perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.INPUT_SUMMARY);
      return new ContentSummary(summary[0], summary[1], summary[2]);
    } finally {
      HiveInterruptUtils.remove(interrup);
    }
  }
}
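
The per-path results computed by the worker threads are folded into a single ContentSummary at the end of the method. The same accumulation can be written standalone, as in the sketch below; the method name and the input collection are hypothetical.

// Fold several per-path summaries into one overall summary, mirroring the
// accumulation loop in getInputSummary above.
static ContentSummary mergeSummaries(Iterable<ContentSummary> perPathSummaries) {
  long length = 0;
  long files = 0;
  long dirs = 0;
  for (ContentSummary cs : perPathSummaries) {
    length += cs.getLength();
    files += cs.getFileCount();
    dirs += cs.getDirectoryCount();
  }
  return new ContentSummary(length, files, dirs);
}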
Use of org.apache.hadoop.fs.ContentSummary in project hive by apache.
The class CorrelationOptimizer, method findPossibleAutoConvertedJoinOperators.
private void findPossibleAutoConvertedJoinOperators() throws SemanticException {
  // Guess based on hive.auto.convert.join.noconditionaltask.size.
  for (JoinOperator joinOp : pCtx.getJoinOps()) {
    boolean isAbleToGuess = true;
    boolean mayConvert = false;
    // Get the total size and each individual alias's size.
    long aliasTotalKnownInputSize = 0;
    Map<String, Long> aliasToSize = new HashMap<String, Long>();
    Map<Integer, Set<String>> posToAliases = new HashMap<Integer, Set<String>>();
    for (int pos = 0; pos < joinOp.getNumParent(); pos++) {
      Operator<? extends OperatorDesc> op = joinOp.getParentOperators().get(pos);
      Set<TableScanOperator> topOps = CorrelationUtilities.findTableScanOperators(op);
      if (topOps.isEmpty()) {
        isAbleToGuess = false;
        break;
      }
      Set<String> aliases = new LinkedHashSet<String>();
      for (TableScanOperator tsop : topOps) {
        Table table = tsop.getConf().getTableMetadata();
        if (table == null) {
          // The table should not be null.
          throw new SemanticException("The table of " + tsop.getName() + " " + tsop.getIdentifier()
              + " is null, which is not expected.");
        }
        String alias = tsop.getConf().getAlias();
        aliases.add(alias);
        Path p = table.getPath();
        ContentSummary resultCs = null;
        try {
          FileSystem fs = table.getPath().getFileSystem(pCtx.getConf());
          resultCs = fs.getContentSummary(p);
        } catch (IOException e) {
          LOG.warn("Encountered an error while querying content summary of table "
              + table.getCompleteName() + " from FileSystem. "
              + "Cannot guess if CommonJoinOperator will optimize "
              + joinOp.getName() + " " + joinOp.getIdentifier());
        }
        if (resultCs == null) {
          isAbleToGuess = false;
          break;
        }
        long size = resultCs.getLength();
        aliasTotalKnownInputSize += size;
        Long es = aliasToSize.get(alias);
        if (es == null) {
          es = 0L;
        }
        es += size;
        aliasToSize.put(alias, es);
      }
      posToAliases.put(pos, aliases);
    }
    if (!isAbleToGuess) {
      LOG.info("Cannot guess if CommonJoinOperator will optimize " + joinOp.getName() + " "
          + joinOp.getIdentifier());
      continue;
    }
    JoinDesc joinDesc = joinOp.getConf();
    Byte[] order = joinDesc.getTagOrder();
    int numAliases = order.length;
    Set<Integer> bigTableCandidates = MapJoinProcessor.getBigTableCandidates(joinDesc.getConds());
    if (bigTableCandidates.isEmpty()) {
      continue;
    }
    long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(pCtx.getConf(), HiveConf.ConfVars.HIVESMALLTABLESFILESIZE);
    for (int i = 0; i < numAliases; i++) {
      // this table cannot be big table
      if (!bigTableCandidates.contains(i)) {
        continue;
      }
      Set<String> aliases = posToAliases.get(i);
      long aliasKnownSize = Utilities.sumOf(aliasToSize, aliases);
      if (!CommonJoinTaskDispatcher.cannotConvert(aliasKnownSize, aliasTotalKnownInputSize, ThresholdOfSmallTblSizeSum)) {
        mayConvert = true;
      }
    }
    if (mayConvert) {
      LOG.info(joinOp.getName() + " " + joinOp.getIdentifier()
          + " may be converted to MapJoin by CommonJoinResolver");
      skipedJoinOperators.add(joinOp);
    }
  }
}
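
The size check inside the last loop can be read as: a join may be converted to a map-join if, for some big-table candidate, the combined known size of all the other inputs stays under the small-table size threshold. The sketch below is a simplified, hypothetical restatement of that idea; the actual decision is made by CommonJoinTaskDispatcher.cannotConvert(...).

// Simplified, hypothetical restatement of the map-join size check above.
static boolean mayConvertToMapJoin(long bigTableAliasKnownSize,
    long aliasTotalKnownInputSize, long smallTableSizeThreshold) {
  // Everything except the big-table candidate must fit under the threshold.
  long otherInputsKnownSize = aliasTotalKnownInputSize - bigTableAliasKnownSize;
  return otherInputsKnownSize <= smallTableSizeThreshold;
}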