use of java.util.concurrent.Future in project hive by apache.
the class Hive method copyFiles.
private static void copyFiles(final HiveConf conf, final FileSystem destFs, FileStatus[] srcs, final FileSystem srcFs, final Path destf, final boolean isSrcLocal, final List<Path> newFiles) throws HiveException {
final HdfsUtils.HadoopFileStatus fullDestStatus;
try {
fullDestStatus = new HdfsUtils.HadoopFileStatus(conf, destFs, destf);
} catch (IOException e1) {
throw new HiveException(e1);
}
if (!fullDestStatus.getFileStatus().isDirectory()) {
throw new HiveException(destf + " is not a directory.");
}
final boolean inheritPerms = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_WAREHOUSE_SUBDIR_INHERIT_PERMS);
final List<Future<ObjectPair<Path, Path>>> futures = new LinkedList<>();
final ExecutorService pool = conf.getInt(ConfVars.HIVE_MOVE_FILES_THREAD_COUNT.varname, 25) > 0 ? Executors.newFixedThreadPool(conf.getInt(ConfVars.HIVE_MOVE_FILES_THREAD_COUNT.varname, 25), new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Move-Thread-%d").build()) : null;
for (FileStatus src : srcs) {
FileStatus[] files;
if (src.isDirectory()) {
try {
files = srcFs.listStatus(src.getPath(), FileUtils.HIDDEN_FILES_PATH_FILTER);
} catch (IOException e) {
if (pool != null) {
pool.shutdownNow();
}
throw new HiveException(e);
}
} else {
files = new FileStatus[] { src };
}
final SessionState parentSession = SessionState.get();
for (final FileStatus srcFile : files) {
final Path srcP = srcFile.getPath();
final boolean needToCopy = needToCopy(srcP, destf, srcFs, destFs);
final boolean isRenameAllowed = !needToCopy && !isSrcLocal;
// If we do a rename for a non-local file, we will be transferring the original
// file permissions from source to the destination. Otherwise, when mvFile() copies
// from source to destination, we inherit the destination parent's group ownership.
final String srcGroup = isRenameAllowed ? srcFile.getGroup() : fullDestStatus.getFileStatus().getGroup();
if (null == pool) {
try {
Path destPath = mvFile(conf, srcFs, srcP, destFs, destf, isSrcLocal, isRenameAllowed);
if (null != newFiles) {
newFiles.add(destPath);
}
} catch (IOException ioe) {
LOG.error("Failed to move: {}", ioe.getMessage());
throw new HiveException(ioe.getCause());
}
} else {
futures.add(pool.submit(new Callable<ObjectPair<Path, Path>>() {
@Override
public ObjectPair<Path, Path> call() throws Exception {
SessionState.setCurrentSessionState(parentSession);
Path destPath = mvFile(conf, srcFs, srcP, destFs, destf, isSrcLocal, isRenameAllowed);
if (inheritPerms) {
HdfsUtils.setFullFileStatus(conf, fullDestStatus, srcGroup, destFs, destPath, false);
}
if (null != newFiles) {
newFiles.add(destPath);
}
return ObjectPair.create(srcP, destPath);
}
}));
}
}
}
if (null == pool) {
if (inheritPerms) {
HdfsUtils.setFullFileStatus(conf, fullDestStatus, null, destFs, destf, true);
}
} else {
pool.shutdown();
for (Future<ObjectPair<Path, Path>> future : futures) {
try {
ObjectPair<Path, Path> pair = future.get();
LOG.debug("Moved src: {}", pair.getFirst().toString(), ", to dest: {}", pair.getSecond().toString());
} catch (Exception e) {
LOG.error("Failed to move: {}", e.getMessage());
pool.shutdownNow();
throw new HiveException(e.getCause());
}
}
}
}
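The essence of the snippet above is: build an optional daemon thread pool, submit one Callable per file, collect the Futures, and shut the pool down forcibly if any get() fails. A minimal, self-contained sketch of that pattern follows; moveOne() and the hard-coded source list are hypothetical stand-ins for mvFile() and the real FileStatus array, not Hive APIs.

import com.google.common.util.concurrent.ThreadFactoryBuilder;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class MoveFilesSketch {

  // Hypothetical stand-in for the per-file work that mvFile() does in the snippet.
  static String moveOne(String src) throws Exception {
    return src + ".moved";
  }

  public static void main(String[] args) throws Exception {
    List<String> sources = List.of("part-00000", "part-00001", "part-00002");

    // Daemon threads, named like the snippet's "Move-Thread-%d" pool.
    ExecutorService pool = Executors.newFixedThreadPool(4,
        new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Move-Thread-%d").build());

    List<Future<String>> futures = new ArrayList<>();
    for (String src : sources) {
      futures.add(pool.submit((Callable<String>) () -> moveOne(src)));
    }

    pool.shutdown(); // accept no new tasks; already-submitted moves keep running
    try {
      for (Future<String> f : futures) {
        System.out.println("moved to " + f.get()); // get() rethrows task failures as ExecutionException
      }
    } catch (Exception e) {
      pool.shutdownNow(); // best-effort cancel of the remaining moves
      throw e;
    }
  }
}

Collecting every Future before calling get() keeps the moves fully parallel; draining them in submission order is fine here because all results are needed before the method can return.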
use of java.util.concurrent.Future in project hive by apache.
the class StatsTask method aggregateStats.
private int aggregateStats(Hive db) {
StatsAggregator statsAggregator = null;
int ret = 0;
StatsCollectionContext scc = null;
EnvironmentContext environmentContext = null;
try {
// Stats setup:
final Warehouse wh = new Warehouse(conf);
if (!getWork().getNoStatsAggregator() && !getWork().isNoScanAnalyzeCommand()) {
try {
scc = getContext();
statsAggregator = createStatsAggregator(scc, conf);
} catch (HiveException e) {
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
throw e;
}
console.printError(ErrorMsg.STATS_SKIPPING_BY_ERROR.getErrorCodedMsg(e.toString()));
}
}
List<Partition> partitions = getPartitionsList(db);
boolean atomic = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_ATOMIC);
String tableFullName = table.getDbName() + "." + table.getTableName();
if (partitions == null) {
org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable();
Map<String, String> parameters = tTable.getParameters();
// An ACID table will not have accurate stats unless they are set through an ANALYZE command.
if (work.getTableSpecs() == null && AcidUtils.isAcidTable(table)) {
StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE);
} else if (work.getTableSpecs() != null || (work.getLoadTableDesc() != null && work.getLoadTableDesc().getReplace()) || (work.getLoadFileDesc() != null && !work.getLoadFileDesc().getDestinationCreateTable().isEmpty())) {
StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.TRUE);
}
// non-partitioned tables:
if (!existStats(parameters) && atomic) {
return 0;
}
// For example, if a file is being loaded, the old row count is no longer valid
if (work.isClearAggregatorStats()) {
// we choose to keep the invalid stats and only change the setting.
StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE);
}
updateQuickStats(wh, parameters, tTable.getSd());
if (StatsSetupConst.areBasicStatsUptoDate(parameters)) {
if (statsAggregator != null) {
String prefix = getAggregationPrefix(table, null);
updateStats(statsAggregator, parameters, prefix, atomic);
}
// write table stats to metastore
if (!getWork().getNoStatsAggregator()) {
environmentContext = new EnvironmentContext();
environmentContext.putToProperties(StatsSetupConst.STATS_GENERATED, StatsSetupConst.TASK);
}
}
getHive().alterTable(tableFullName, new Table(tTable), environmentContext);
if (conf.getBoolVar(ConfVars.TEZ_EXEC_SUMMARY)) {
console.printInfo("Table " + tableFullName + " stats: [" + toString(parameters) + ']');
}
LOG.info("Table " + tableFullName + " stats: [" + toString(parameters) + ']');
} else {
// Partitioned table:
// Need to get the old stats of the partition
// and update the table stats based on the old and new stats.
List<Partition> updates = new ArrayList<Partition>();
// Get the file statuses up front for all partitions. This is especially beneficial on blob storage systems.
final Map<String, FileStatus[]> fileStatusMap = new ConcurrentHashMap<String, FileStatus[]>();
int poolSize = conf.getInt(ConfVars.HIVE_MOVE_FILES_THREAD_COUNT.varname, 1);
// If the thread count is set to 0, use a single thread.
poolSize = Math.max(poolSize, 1);
final ExecutorService pool = Executors.newFixedThreadPool(poolSize, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("stats-updater-thread-%d").build());
final List<Future<Void>> futures = Lists.newLinkedList();
LOG.debug("Getting file stats of all partitions. threadpool size:" + poolSize);
try {
for (final Partition partn : partitions) {
final String partitionName = partn.getName();
final org.apache.hadoop.hive.metastore.api.Partition tPart = partn.getTPartition();
Map<String, String> parameters = tPart.getParameters();
if (!existStats(parameters) && atomic) {
continue;
}
futures.add(pool.submit(new Callable<Void>() {
@Override
public Void call() throws Exception {
FileStatus[] partfileStatus = wh.getFileStatusesForSD(tPart.getSd());
fileStatusMap.put(partitionName, partfileStatus);
return null;
}
}));
}
pool.shutdown();
for (Future<Void> future : futures) {
future.get();
}
} catch (InterruptedException e) {
LOG.debug("Cancelling " + futures.size() + " file stats lookup tasks");
//cancel other futures
for (Future future : futures) {
future.cancel(true);
}
// Fail the query if the stats are supposed to be reliable
if (work.isStatsReliable()) {
ret = 1;
}
} finally {
if (pool != null) {
pool.shutdownNow();
}
LOG.debug("Finished getting file stats of all partitions");
}
for (Partition partn : partitions) {
//
// get the old partition stats
//
org.apache.hadoop.hive.metastore.api.Partition tPart = partn.getTPartition();
Map<String, String> parameters = tPart.getParameters();
if (work.getTableSpecs() == null && AcidUtils.isAcidTable(table)) {
StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE);
} else if (work.getTableSpecs() != null || (work.getLoadTableDesc() != null && work.getLoadTableDesc().getReplace()) || (work.getLoadFileDesc() != null && !work.getLoadFileDesc().getDestinationCreateTable().isEmpty())) {
StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.TRUE);
}
// an entry is added to fileStatusMap only when the stats exist, so skip partitions without one
if (!fileStatusMap.containsKey(partn.getName())) {
continue;
}
// For example, if a file is being loaded, the old row count is no longer valid
if (work.isClearAggregatorStats()) {
// we choose to keep the invalid stats and only change the setting.
StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE);
}
updateQuickStats(parameters, fileStatusMap.get(partn.getName()));
if (StatsSetupConst.areBasicStatsUptoDate(parameters)) {
if (statsAggregator != null) {
String prefix = getAggregationPrefix(table, partn);
updateStats(statsAggregator, parameters, prefix, atomic);
}
if (!getWork().getNoStatsAggregator()) {
environmentContext = new EnvironmentContext();
environmentContext.putToProperties(StatsSetupConst.STATS_GENERATED, StatsSetupConst.TASK);
}
}
updates.add(new Partition(table, tPart));
if (conf.getBoolVar(ConfVars.TEZ_EXEC_SUMMARY)) {
console.printInfo("Partition " + tableFullName + partn.getSpec() + " stats: [" + toString(parameters) + ']');
}
LOG.info("Partition " + tableFullName + partn.getSpec() + " stats: [" + toString(parameters) + ']');
}
if (!updates.isEmpty()) {
db.alterPartitions(tableFullName, updates, environmentContext);
}
}
} catch (Exception e) {
console.printInfo("[Warning] could not update stats.", "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e));
// Fail the query if the stats are supposed to be reliable
if (work.isStatsReliable()) {
ret = 1;
}
} finally {
if (statsAggregator != null) {
statsAggregator.closeConnection(scc);
}
}
// 0 indicates success; anything else indicates failure
return ret;
}
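The Future usage worth isolating here is the pre-fetch loop: each partition's file listing is submitted as a Callable<Void> whose only effect is filling a ConcurrentHashMap, and the futures are drained solely to surface failures and to allow cancellation on interrupt. A rough sketch under those assumptions; listFiles() is a hypothetical stand-in for wh.getFileStatusesForSD(), not a Hive API.

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class PartitionPrefetchSketch {

  // Hypothetical stand-in for wh.getFileStatusesForSD(): "list" a partition's files.
  static int listFiles(String partition) throws Exception {
    return partition.length();
  }

  public static void main(String[] args) {
    List<String> partitions = List.of("ds=2024-01-01", "ds=2024-01-02");
    Map<String, Integer> fileCounts = new ConcurrentHashMap<>();

    int poolSize = Math.max(1, 0); // mirror the snippet's guard: a configured 0 still means one thread
    ExecutorService pool = Executors.newFixedThreadPool(poolSize);
    List<Future<Void>> futures = new ArrayList<>();
    try {
      for (String p : partitions) {
        futures.add(pool.submit((Callable<Void>) () -> {
          fileCounts.put(p, listFiles(p)); // side effect only; the Future itself carries no value
          return null;
        }));
      }
      pool.shutdown();
      for (Future<Void> f : futures) {
        f.get(); // surface any lookup failure before the stats are consumed
      }
    } catch (Exception e) {
      for (Future<Void> f : futures) {
        f.cancel(true); // interrupt lookups that have not finished yet
      }
    } finally {
      pool.shutdownNow();
      System.out.println(fileCounts);
    }
  }
}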
use of java.util.concurrent.Future in project hive by apache.
the class OrcInputFormat method generateSplitsInfo.
static List<OrcSplit> generateSplitsInfo(Configuration conf, Context context) throws IOException {
if (LOG.isInfoEnabled()) {
LOG.info("ORC pushdown predicate: " + context.sarg);
}
boolean useFileIdsConfig = HiveConf.getBoolVar(conf, ConfVars.HIVE_ORC_INCLUDE_FILE_ID_IN_SPLITS);
// Sharing this state assumes splits will succeed or fail to get it together (same FS).
// We also start with null and only set it to true on the first call, so we would only do
// the global-disable thing on the first failure w/the API error, not any random failure.
Ref<Boolean> useFileIds = Ref.from(useFileIdsConfig ? null : false);
boolean allowSyntheticFileIds = useFileIdsConfig && HiveConf.getBoolVar(conf, ConfVars.HIVE_ORC_ALLOW_SYNTHETIC_FILE_ID_IN_SPLITS);
List<OrcSplit> splits = Lists.newArrayList();
List<Future<AcidDirInfo>> pathFutures = Lists.newArrayList();
List<Future<Void>> strategyFutures = Lists.newArrayList();
final List<Future<List<OrcSplit>>> splitFutures = Lists.newArrayList();
UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
// multi-threaded file statuses and split strategy
Path[] paths = getInputPaths(conf);
CompletionService<AcidDirInfo> ecs = new ExecutorCompletionService<>(Context.threadPool);
for (Path dir : paths) {
FileSystem fs = dir.getFileSystem(conf);
FileGenerator fileGenerator = new FileGenerator(context, fs, dir, useFileIds, ugi);
pathFutures.add(ecs.submit(fileGenerator));
}
boolean isTransactionalTableScan = HiveConf.getBoolVar(conf, ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN);
boolean isSchemaEvolution = HiveConf.getBoolVar(conf, ConfVars.HIVE_SCHEMA_EVOLUTION);
TypeDescription readerSchema = OrcInputFormat.getDesiredRowTypeDescr(conf, isTransactionalTableScan, Integer.MAX_VALUE);
List<OrcProto.Type> readerTypes = null;
if (readerSchema != null) {
readerTypes = OrcUtils.getOrcTypes(readerSchema);
}
if (LOG.isDebugEnabled()) {
LOG.debug("Generate splits schema evolution property " + isSchemaEvolution + " reader schema " + (readerSchema == null ? "NULL" : readerSchema.toString()) + " transactional scan property " + isTransactionalTableScan);
}
// complete path futures and schedule split generation
try {
CombinedCtx combinedCtx = (context.splitStrategyBatchMs > 0) ? new CombinedCtx() : null;
long maxWaitUs = context.splitStrategyBatchMs * 1000000;
int resultsLeft = paths.length;
while (resultsLeft > 0) {
AcidDirInfo adi = null;
if (combinedCtx != null && combinedCtx.combined != null) {
long waitTimeUs = combinedCtx.combineStartUs + maxWaitUs - System.nanoTime();
if (waitTimeUs >= 0) {
Future<AcidDirInfo> f = ecs.poll(waitTimeUs, TimeUnit.NANOSECONDS);
adi = (f == null) ? null : f.get();
}
} else {
adi = ecs.take().get();
}
if (adi == null) {
// We were combining SS-es and the time has expired.
assert combinedCtx.combined != null;
scheduleSplits(combinedCtx.combined, context, splitFutures, strategyFutures, splits);
combinedCtx.combined = null;
continue;
}
// We have received a new directory information, make split strategies.
--resultsLeft;
// The reason why we can get a list of split strategies here is because for ACID split-update
// case when we have a mix of original base files & insert deltas, we will produce two
// independent split strategies for them. There is a global flag 'isOriginal' that is set
on a per-split-strategy basis and it has to be the same for all the files in that strategy.
List<SplitStrategy<?>> splitStrategies = determineSplitStrategies(combinedCtx, context, adi.fs, adi.splitPath, adi.acidInfo, adi.baseFiles, adi.parsedDeltas, readerTypes, ugi, allowSyntheticFileIds);
for (SplitStrategy<?> splitStrategy : splitStrategies) {
if (isDebugEnabled) {
LOG.debug("Split strategy: {}", splitStrategy);
}
// This works purely by magic, because we know which strategy produces which type.
if (splitStrategy instanceof ETLSplitStrategy) {
scheduleSplits((ETLSplitStrategy) splitStrategy, context, splitFutures, strategyFutures, splits);
} else {
@SuppressWarnings("unchecked") List<OrcSplit> readySplits = (List<OrcSplit>) splitStrategy.getSplits();
splits.addAll(readySplits);
}
}
}
// Run the last combined strategy, if any.
if (combinedCtx != null && combinedCtx.combined != null) {
scheduleSplits(combinedCtx.combined, context, splitFutures, strategyFutures, splits);
combinedCtx.combined = null;
}
// complete split futures
for (Future<Void> ssFuture : strategyFutures) {
// Make sure we get exceptions strategies might have thrown.
ssFuture.get();
}
// All the split strategies are done, so it must be safe to access splitFutures.
for (Future<List<OrcSplit>> splitFuture : splitFutures) {
splits.addAll(splitFuture.get());
}
} catch (Exception e) {
cancelFutures(pathFutures);
cancelFutures(strategyFutures);
cancelFutures(splitFutures);
throw new RuntimeException("ORC split generation failed with exception: " + e.getMessage(), e);
}
if (context.cacheStripeDetails) {
LOG.info("FooterCacheHitRatio: " + context.cacheHitCounter.get() + "/" + context.numFilesCounter.get());
}
if (isDebugEnabled) {
for (OrcSplit split : splits) {
LOG.debug(split + " projected_columns_uncompressed_size: " + split.getColumnarProjectionSize());
}
}
return splits;
}
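Unlike the previous snippets, this one consumes results in completion order through an ExecutorCompletionService, and uses poll() with a deadline so that batched split strategies can be flushed when no directory result arrives in time. A stripped-down sketch of that consumption loop, with a hypothetical scan() standing in for FileGenerator and the flush step reduced to a comment:

import java.util.List;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

public class CompletionServiceSketch {

  // Hypothetical stand-in for FileGenerator: "scan" one input directory.
  static String scan(String dir) throws InterruptedException {
    Thread.sleep(50);
    return dir + " scanned";
  }

  public static void main(String[] args) throws Exception {
    List<String> dirs = List.of("dir1", "dir2", "dir3");
    ExecutorService pool = Executors.newFixedThreadPool(2);
    CompletionService<String> ecs = new ExecutorCompletionService<>(pool);

    for (String dir : dirs) {
      ecs.submit(() -> scan(dir));
    }

    int resultsLeft = dirs.size();
    try {
      while (resultsLeft > 0) {
        // Poll with a deadline, as the snippet does while batching ETL split strategies.
        Future<String> f = ecs.poll(200, TimeUnit.MILLISECONDS);
        if (f == null) {
          // The wait expired; the real code flushes the combined strategy at this point.
          continue;
        }
        System.out.println(f.get()); // results arrive in completion order, not submission order
        --resultsLeft;
      }
    } finally {
      pool.shutdownNow();
    }
  }
}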
use of java.util.concurrent.Future in project hive by apache.
the class HiveMetaStoreChecker method checkPartitionDirs.
private void checkPartitionDirs(final ExecutorService executor, final Path basePath, final Set<Path> result, final FileSystem fs, final int maxDepth) throws HiveException {
try {
Queue<Future<Path>> futures = new LinkedList<Future<Path>>();
ConcurrentLinkedQueue<PathDepthInfo> nextLevel = new ConcurrentLinkedQueue<>();
nextLevel.add(new PathDepthInfo(basePath, 0));
// process the directory tree one level at a time until no new sub-directories are discovered
while (!nextLevel.isEmpty()) {
ConcurrentLinkedQueue<PathDepthInfo> tempQueue = new ConcurrentLinkedQueue<>();
//process each level in parallel
while (!nextLevel.isEmpty()) {
futures.add(executor.submit(new PathDepthInfoCallable(nextLevel.poll(), maxDepth, fs, tempQueue)));
}
while (!futures.isEmpty()) {
Path p = futures.poll().get();
if (p != null) {
result.add(p);
}
}
//update nextLevel with the sub-directories newly discovered above
nextLevel = tempQueue;
}
} catch (InterruptedException | ExecutionException e) {
LOG.error(e.getMessage());
executor.shutdownNow();
throw new HiveException(e.getCause());
}
}
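The traversal above is a level-by-level breadth-first search: one task per directory at the current level, a queue of Futures drained for results, and a shared ConcurrentLinkedQueue that the tasks fill with the next level's directories. A simplified sketch of that shape, using plain strings in place of HDFS Paths; the "has sub-directories" test is deliberately fake.

import java.util.LinkedList;
import java.util.Queue;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class LevelBfsSketch {

  public static void main(String[] args) throws Exception {
    ExecutorService executor = Executors.newFixedThreadPool(4);
    Queue<Future<String>> futures = new LinkedList<>();

    // Seed with the base path; plain strings stand in for HDFS Paths.
    ConcurrentLinkedQueue<String> nextLevel = new ConcurrentLinkedQueue<>();
    nextLevel.add("/warehouse/t");

    try {
      while (!nextLevel.isEmpty()) {
        ConcurrentLinkedQueue<String> tempQueue = new ConcurrentLinkedQueue<>();
        // Process the whole level in parallel; each task may enqueue children into tempQueue.
        while (!nextLevel.isEmpty()) {
          String dir = nextLevel.poll();
          futures.add(executor.submit((Callable<String>) () -> {
            boolean hasChild = dir.split("/").length < 4; // fake "has sub-directories" test
            if (hasChild) {
              tempQueue.add(dir + "/part=0"); // descend: queue the child for the next level
              return null; // not a leaf, nothing to report yet
            }
            return dir; // leaf directory: report it as a result
          }));
        }
        while (!futures.isEmpty()) {
          String leaf = futures.poll().get();
          if (leaf != null) {
            System.out.println("found " + leaf);
          }
        }
        nextLevel = tempQueue; // move on to the newly discovered level
      }
    } catch (InterruptedException | ExecutionException e) {
      executor.shutdownNow();
      throw e;
    } finally {
      executor.shutdown();
    }
  }
}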
use of java.util.concurrent.Future in project hive by apache.
the class ConcurrentJobRequestsTestBase method executeJobOperations.
public void executeJobOperations(JobRunnable jobRunnable, int threadCount, boolean killThreads, boolean interruptThreads) throws IOException, InterruptedException, QueueException, NotAuthorizedException {
started = false;
ExecutorService executorService = new ThreadPoolExecutor(threadCount, threadCount, 0L, TimeUnit.MILLISECONDS, new LinkedBlockingQueue<Runnable>());
ArrayList<Future<?>> futures = new ArrayList<Future<?>>();
for (int i = 0; i < threadCount; i++) {
futures.add(executorService.submit(jobRunnable));
}
waitForAllThreadsToStart(jobRunnable, threadCount);
LOG.info("Started all threads ");
if (killThreads) {
executorService.shutdownNow();
} else {
if (interruptThreads) {
for (Future<?> future : futures) {
LOG.info("Cancelling the thread");
future.cancel(true);
}
}
executorService.shutdown();
}
/*
* For both graceful or forceful shutdown, wait for tasks to terminate such that
* appropriate exceptions are raised and stored in JobRunnable.exception.
*/
if (!executorService.awaitTermination(60, TimeUnit.SECONDS)) {
LOG.info("Force Shutting down the pool\n");
if (!killThreads) {
/*
* killThreads option has already done force shutdown. No need to do again.
*/
executorService.shutdownNow();
}
}
}
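This test harness exercises the three ways a Future-backed task can be stopped: shutdownNow() on the pool, cancel(true) on each Future, or a plain shutdown() followed by awaitTermination(). A small sketch of the interrupt path, with a sleeping Runnable as a hypothetical stand-in for JobRunnable:

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

public class CancelSketch {

  public static void main(String[] args) throws InterruptedException {
    int threadCount = 4;
    boolean interruptThreads = true; // flip to false to exercise the graceful path

    ExecutorService executorService = Executors.newFixedThreadPool(threadCount);
    List<Future<?>> futures = new ArrayList<>();
    Runnable job = () -> {
      try {
        Thread.sleep(10_000); // long-running stand-in for JobRunnable
      } catch (InterruptedException e) {
        System.out.println(Thread.currentThread().getName() + " interrupted");
      }
    };

    for (int i = 0; i < threadCount; i++) {
      futures.add(executorService.submit(job));
    }

    if (interruptThreads) {
      for (Future<?> future : futures) {
        future.cancel(true); // delivers an interrupt to a task that is already running
      }
    }
    executorService.shutdown();

    // Give the tasks a bounded window to observe the interrupt and exit.
    if (!executorService.awaitTermination(5, TimeUnit.SECONDS)) {
      executorService.shutdownNow();
    }
  }
}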