use of java.util.concurrent.Future in project storm by apache.
the class Localizer method updateBlobs.
/**
* This function updates blobs on the supervisor. It uses a separate thread pool and runs
* asynchronously from the download and delete operations.
*/
public List<LocalizedResource> updateBlobs(List<LocalResource> localResources, String user) throws AuthorizationException, KeyNotFoundException, IOException {
LocalizedResourceSet lrsrcSet = _userRsrc.get(user);
ArrayList<LocalizedResource> results = new ArrayList<>();
ArrayList<Callable<LocalizedResource>> updates = new ArrayList<>();
if (lrsrcSet == null) {
// resource set must have been removed
return results;
}
ClientBlobStore blobstore = null;
try {
blobstore = getClientBlobStore();
for (LocalResource localResource : localResources) {
String key = localResource.getBlobName();
LocalizedResource lrsrc = lrsrcSet.get(key, localResource.shouldUncompress());
if (lrsrc == null) {
LOG.warn("blob requested for update doesn't exist: {}", key);
continue;
} else {
// update it if either the version isn't the latest or if any local blob files are missing
if (!isLocalizedResourceUpToDate(lrsrc, blobstore) || !isLocalizedResourceDownloaded(lrsrc)) {
LOG.debug("updating blob: {}", key);
updates.add(new DownloadBlob(this, _conf, key, new File(lrsrc.getFilePath()), user, lrsrc.isUncompressed(), true));
}
}
}
} finally {
if (blobstore != null) {
blobstore.shutdown();
}
}
try {
List<Future<LocalizedResource>> futures = _updateExecService.invokeAll(updates);
for (Future<LocalizedResource> futureRsrc : futures) {
try {
LocalizedResource lrsrc = futureRsrc.get();
// put the resource back in case it was removed at the same time by the cleaner
LocalizedResourceSet newSet = new LocalizedResourceSet(user);
LocalizedResourceSet newlrsrcSet = _userRsrc.putIfAbsent(user, newSet);
if (newlrsrcSet == null) {
newlrsrcSet = newSet;
}
newlrsrcSet.putIfAbsent(lrsrc.getKey(), lrsrc, lrsrc.isUncompressed());
results.add(lrsrc);
} catch (ExecutionException e) {
LOG.error("Error updating blob: ", e);
if (e.getCause() instanceof AuthorizationException) {
throw (AuthorizationException) e.getCause();
}
if (e.getCause() instanceof KeyNotFoundException) {
throw (KeyNotFoundException) e.getCause();
}
}
}
} catch (RejectedExecutionException re) {
LOG.error("Error updating blobs : ", re);
} catch (InterruptedException ie) {
throw new IOException("Interrupted Exception", ie);
}
return results;
}
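
The core of updateBlobs is the invokeAll pattern: batch the update tasks as Callables, hand them to the executor, and drain the returned Futures. Below is a minimal, self-contained sketch of that pattern; the class name, the fixed pool size, and the string-returning tasks are placeholders for illustration, not Storm's actual DownloadBlob implementation.

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class InvokeAllSketch {
    public static void main(String[] args) throws InterruptedException {
        ExecutorService pool = Executors.newFixedThreadPool(4);
        List<Callable<String>> updates = new ArrayList<>();
        for (String key : new String[] { "blob-a", "blob-b", "blob-c" }) {
            // each Callable stands in for one DownloadBlob task
            updates.add(() -> "updated " + key);
        }
        List<String> results = new ArrayList<>();
        try {
            // invokeAll blocks until every task has completed or failed
            List<Future<String>> futures = pool.invokeAll(updates);
            for (Future<String> f : futures) {
                try {
                    results.add(f.get());
                } catch (ExecutionException e) {
                    // a failed task surfaces here; the method above rethrows selected causes
                    System.err.println("task failed: " + e.getCause());
                }
            }
        } finally {
            pool.shutdown();
        }
        results.forEach(System.out::println);
    }
}

Each Future either yields its task's result from get() or wraps the task's failure in an ExecutionException, which is why updateBlobs inspects getCause() to rethrow AuthorizationException and KeyNotFoundException.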
use of java.util.concurrent.Future in project hive by apache.
the class StatsUtils method getFileSizeForPartitions.
/**
* Find the number of bytes on disk occupied by a list of partitions
* @param conf - hive conf
* @param parts - partition list
* @return sizes of partitions
*/
public static List<Long> getFileSizeForPartitions(final HiveConf conf, List<Partition> parts) {
LOG.info("Number of partitions : " + parts.size());
ArrayList<Future<Long>> futures = new ArrayList<>();
int threads = Math.max(1, conf.getIntVar(ConfVars.METASTORE_FS_HANDLER_THREADS_COUNT));
final ExecutorService pool = Executors.newFixedThreadPool(threads, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Get-Partitions-Size-%d").build());
final ArrayList<Long> sizes = new ArrayList<>(parts.size());
for (final Partition part : parts) {
final Path path = part.getDataLocation();
futures.add(pool.submit(new Callable<Long>() {
@Override
public Long call() throws Exception {
try {
LOG.debug("Partition path : " + path);
FileSystem fs = path.getFileSystem(conf);
return fs.getContentSummary(path).getLength();
} catch (IOException e) {
return 0L;
}
}
}));
}
try {
for (int i = 0; i < futures.size(); i++) {
sizes.add(i, futures.get(i).get());
}
} catch (InterruptedException | ExecutionException e) {
LOG.warn("Exception in processing files ", e);
} finally {
pool.shutdownNow();
}
return sizes;
}
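
The method above relies on the futures list being built in the same order as the partitions, so sizes.get(i) lines up with parts.get(i). Here is a stripped-down sketch of that ordering, using local java.io.File lengths as a hypothetical stand-in for the HDFS getContentSummary call; the paths and pool size are made up for illustration.

import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class PartitionSizeSketch {
    public static void main(String[] args) throws InterruptedException {
        List<File> parts = List.of(new File("/tmp/part-a"), new File("/tmp/part-b"));
        ExecutorService pool = Executors.newFixedThreadPool(2);
        List<Future<Long>> futures = new ArrayList<>();
        for (File part : parts) {
            // missing files count as 0L, loosely mirroring the IOException -> 0L fallback above
            futures.add(pool.submit(() -> part.exists() ? part.length() : 0L));
        }
        List<Long> sizes = new ArrayList<>(parts.size());
        try {
            // futures were added in the same order as parts, so index i matches parts.get(i)
            for (Future<Long> future : futures) {
                sizes.add(future.get());
            }
        } catch (ExecutionException e) {
            System.err.println("Exception in processing files: " + e.getCause());
        } finally {
            pool.shutdownNow();
        }
        System.out.println(sizes);
    }
}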
use of java.util.concurrent.Future in project hive by apache.
the class Utilities method getInputPaths.
/**
* Computes a list of all input paths needed to compute the given MapWork. All aliases
* are considered and a merged list of input paths is returned. If any input path points
* to an empty table or partition a dummy file in the scratch dir is instead created and
* added to the list. This is needed to avoid special casing the operator pipeline for
* these cases.
*
* @param job JobConf used to run the job
* @param work MapWork encapsulating the info about the task
* @param hiveScratchDir The tmp dir used to create dummy files if needed
* @param ctx Context object
* @return List of paths to process for the given MapWork
* @throws Exception
*/
public static List<Path> getInputPaths(JobConf job, MapWork work, Path hiveScratchDir, Context ctx, boolean skipDummy) throws Exception {
Set<Path> pathsProcessed = new HashSet<Path>();
List<Path> pathsToAdd = new LinkedList<Path>();
// AliasToWork contains all the aliases
for (String alias : work.getAliasToWork().keySet()) {
LOG.info("Processing alias " + alias);
// The alias may not have any path
boolean isEmptyTable = true;
boolean hasLogged = false;
// Note: this copies the list because createDummyFileForEmptyPartition may modify the map.
for (Path file : new LinkedList<Path>(work.getPathToAliases().keySet())) {
List<String> aliases = work.getPathToAliases().get(file);
if (aliases.contains(alias)) {
if (file != null) {
isEmptyTable = false;
} else {
LOG.warn("Found a null path for alias " + alias);
continue;
}
// processed only once
if (pathsProcessed.contains(file)) {
continue;
}
StringInternUtils.internUriStringsInPath(file);
pathsProcessed.add(file);
if (LOG.isDebugEnabled()) {
LOG.debug("Adding input file " + file);
} else if (!hasLogged) {
hasLogged = true;
LOG.info("Adding " + work.getPathToAliases().size() + " inputs; the first input is " + file);
}
pathsToAdd.add(file);
}
}
// if the alias resolved to no input path, add a dummy file so the empty table still contributes an (empty) input
if (isEmptyTable && !skipDummy) {
pathsToAdd.add(createDummyFileForEmptyTable(job, work, hiveScratchDir, alias));
}
}
ExecutorService pool = null;
int numExecutors = getMaxExecutorsForInputListing(job, pathsToAdd.size());
if (numExecutors > 1) {
pool = Executors.newFixedThreadPool(numExecutors, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Get-Input-Paths-%d").build());
}
List<Path> finalPathsToAdd = new LinkedList<>();
List<Future<Path>> futures = new LinkedList<>();
for (final Path path : pathsToAdd) {
if (pool == null) {
finalPathsToAdd.add(new GetInputPathsCallable(path, job, work, hiveScratchDir, ctx, skipDummy).call());
} else {
futures.add(pool.submit(new GetInputPathsCallable(path, job, work, hiveScratchDir, ctx, skipDummy)));
}
}
if (pool != null) {
for (Future<Path> future : futures) {
finalPathsToAdd.add(future.get());
}
}
return finalPathsToAdd;
}
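
getInputPaths only creates a thread pool when more than one executor is warranted; otherwise it invokes the Callable inline on the calling thread. A condensed sketch of that conditional-parallelism pattern follows, with a placeholder task standing in for GetInputPathsCallable and a pool size derived from available processors purely for illustration.

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class ConditionalPoolSketch {
    public static void main(String[] args) throws Exception {
        List<String> inputs = List.of("path1", "path2", "path3");
        // only pay for a thread pool when there is enough work to parallelize
        int numExecutors = Math.min(inputs.size(), Runtime.getRuntime().availableProcessors());
        ExecutorService pool = numExecutors > 1 ? Executors.newFixedThreadPool(numExecutors) : null;
        List<String> results = new ArrayList<>();
        List<Future<String>> futures = new ArrayList<>();
        for (String input : inputs) {
            Callable<String> task = () -> "processed " + input; // stand-in for GetInputPathsCallable
            if (pool == null) {
                results.add(task.call()); // single-threaded: run inline on the caller
            } else {
                futures.add(pool.submit(task)); // multi-threaded: defer through a Future
            }
        }
        if (pool != null) {
            for (Future<String> future : futures) {
                results.add(future.get());
            }
            pool.shutdown();
        }
        System.out.println(results);
    }
}

Running the task inline when the pool is null keeps the single-path case free of executor overhead while sharing the same Callable code path.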
use of java.util.concurrent.Future in project hive by apache.
the class Utilities method getInputSummary.
/**
* Calculate the total size of input files.
*
* @param ctx the hadoop job context
* @param work map reduce job plan
* @param filter filter to apply to the input paths before calculating size
* @return the summary of all the input paths.
* @throws IOException
*/
public static ContentSummary getInputSummary(final Context ctx, MapWork work, PathFilter filter) throws IOException {
PerfLogger perfLogger = SessionState.getPerfLogger();
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.INPUT_SUMMARY);
long[] summary = { 0, 0, 0 };
final Set<Path> pathNeedProcess = new HashSet<>();
// synchronizing here keeps the number of threads from growing out of control when multiple callers arrive concurrently.
synchronized (INPUT_SUMMARY_LOCK) {
// For each input path, calculate the total size.
for (Path path : work.getPathToAliases().keySet()) {
Path p = path;
if (filter != null && !filter.accept(p)) {
continue;
}
ContentSummary cs = ctx.getCS(path);
if (cs == null) {
if (path == null) {
continue;
}
pathNeedProcess.add(path);
} else {
summary[0] += cs.getLength();
summary[1] += cs.getFileCount();
summary[2] += cs.getDirectoryCount();
}
}
// Process the paths for which a name node call is needed
final Map<String, ContentSummary> resultMap = new ConcurrentHashMap<String, ContentSummary>();
ArrayList<Future<?>> results = new ArrayList<Future<?>>();
final ExecutorService executor;
int numExecutors = getMaxExecutorsForInputListing(ctx.getConf(), pathNeedProcess.size());
if (numExecutors > 1) {
LOG.info("Using " + numExecutors + " threads for getContentSummary");
executor = Executors.newFixedThreadPool(numExecutors, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Get-Input-Summary-%d").build());
} else {
executor = null;
}
HiveInterruptCallback interrup = HiveInterruptUtils.add(new HiveInterruptCallback() {
@Override
public void interrupt() {
for (Path path : pathNeedProcess) {
try {
path.getFileSystem(ctx.getConf()).close();
} catch (IOException ignore) {
LOG.debug("Failed to close filesystem", ignore);
}
}
if (executor != null) {
executor.shutdownNow();
}
}
});
try {
Configuration conf = ctx.getConf();
JobConf jobConf = new JobConf(conf);
for (Path path : pathNeedProcess) {
final Path p = path;
final String pathStr = path.toString();
// All threads share the same Configuration and JobConf based on the
// assumption that they are thread safe if only read operations are
// executed. This is not stated in Hadoop's javadoc, but the source code
// clearly shows that efforts were made toward it, and we believe it is
// thread safe. We will revisit this piece of code if we find the assumption
// is not correct.
final Configuration myConf = conf;
final JobConf myJobConf = jobConf;
final Map<String, Operator<?>> aliasToWork = work.getAliasToWork();
final Map<Path, ArrayList<String>> pathToAlias = work.getPathToAliases();
final PartitionDesc partDesc = work.getPathToPartitionInfo().get(p);
Runnable r = new Runnable() {
@Override
public void run() {
try {
Class<? extends InputFormat> inputFormatCls = partDesc.getInputFileFormatClass();
InputFormat inputFormatObj = HiveInputFormat.getInputFormatFromCache(inputFormatCls, myJobConf);
if (inputFormatObj instanceof ContentSummaryInputFormat) {
ContentSummaryInputFormat cs = (ContentSummaryInputFormat) inputFormatObj;
resultMap.put(pathStr, cs.getContentSummary(p, myJobConf));
return;
}
String metaTableStorage = null;
if (partDesc.getTableDesc() != null && partDesc.getTableDesc().getProperties() != null) {
metaTableStorage = partDesc.getTableDesc().getProperties().getProperty(hive_metastoreConstants.META_TABLE_STORAGE, null);
}
if (partDesc.getProperties() != null) {
metaTableStorage = partDesc.getProperties().getProperty(hive_metastoreConstants.META_TABLE_STORAGE, metaTableStorage);
}
HiveStorageHandler handler = HiveUtils.getStorageHandler(myConf, metaTableStorage);
if (handler instanceof InputEstimator) {
long total = 0;
TableDesc tableDesc = partDesc.getTableDesc();
InputEstimator estimator = (InputEstimator) handler;
for (String alias : HiveFileFormatUtils.doGetAliasesFromPath(pathToAlias, p)) {
JobConf jobConf = new JobConf(myJobConf);
TableScanOperator scanOp = (TableScanOperator) aliasToWork.get(alias);
Utilities.setColumnNameList(jobConf, scanOp, true);
Utilities.setColumnTypeList(jobConf, scanOp, true);
PlanUtils.configureInputJobPropertiesForStorageHandler(tableDesc);
Utilities.copyTableJobPropertiesToConf(tableDesc, jobConf);
total += estimator.estimate(jobConf, scanOp, -1).getTotalLength();
}
resultMap.put(pathStr, new ContentSummary(total, -1, -1));
} else {
// TODO: the summary should be nullified for non-native tables
// so they are not selected as a map-join target
FileSystem fs = p.getFileSystem(myConf);
resultMap.put(pathStr, fs.getContentSummary(p));
}
} catch (Exception e) {
// We safely ignore this exception for summary data.
// We don't update the cache, so a failed lookup cannot pollute
// other usages. The worst case is that the IOException will be
// retried on another getInputSummary() call, which is fine since
// IOException is not considered a common case.
LOG.info("Cannot get size of " + pathStr + ". Safely ignored.");
}
}
};
if (executor == null) {
r.run();
} else {
Future<?> result = executor.submit(r);
results.add(result);
}
}
if (executor != null) {
for (Future<?> result : results) {
boolean executorDone = false;
do {
try {
result.get();
executorDone = true;
} catch (InterruptedException e) {
LOG.info("Interrupted when waiting threads: ", e);
Thread.currentThread().interrupt();
break;
} catch (ExecutionException e) {
throw new IOException(e);
}
} while (!executorDone);
}
executor.shutdown();
}
HiveInterruptUtils.checkInterrupted();
for (Map.Entry<String, ContentSummary> entry : resultMap.entrySet()) {
ContentSummary cs = entry.getValue();
summary[0] += cs.getLength();
summary[1] += cs.getFileCount();
summary[2] += cs.getDirectoryCount();
ctx.addCS(entry.getKey(), cs);
LOG.info("Cache Content Summary for " + entry.getKey() + " length: " + cs.getLength() + " file count: " + cs.getFileCount() + " directory count: " + cs.getDirectoryCount());
}
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.INPUT_SUMMARY);
return new ContentSummary(summary[0], summary[1], summary[2]);
} finally {
HiveInterruptUtils.remove(interrup);
}
}
}
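
Unlike the previous examples, getInputSummary submits Runnables, so its Futures carry no values: results flow through a ConcurrentHashMap keyed by path string, and the Futures are used only to wait for completion and to surface failures as ExecutionException. Below is a reduced sketch of that arrangement; the class name and paths are hypothetical, and string lengths replace the real ContentSummary lookups.

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class InputSummarySketch {
    public static void main(String[] args) throws InterruptedException {
        List<String> paths = List.of("/warehouse/t1", "/warehouse/t2");
        Map<String, Long> resultMap = new ConcurrentHashMap<>();
        ExecutorService executor = Executors.newFixedThreadPool(2);
        List<Future<?>> results = new ArrayList<>();
        for (String path : paths) {
            results.add(executor.submit(() -> {
                // the task records its answer in the shared map; the Future is only a completion handle
                resultMap.put(path, (long) path.length());
            }));
        }
        try {
            for (Future<?> result : results) {
                try {
                    result.get(); // blocks until the task finishes and rethrows its failure, if any
                } catch (ExecutionException e) {
                    throw new RuntimeException(e.getCause());
                }
            }
        } finally {
            executor.shutdown();
        }
        System.out.println(resultMap);
    }
}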
use of java.util.concurrent.Future in project hive by apache.
the class Hive method trashFiles.
/**
* Trashes or deletes all files under a directory. Leaves the directory as is.
* @param fs FileSystem to use
* @param statuses fileStatuses of files to be deleted
* @param conf hive configuration
* @return true if deletion successful
* @throws IOException
*/
public static boolean trashFiles(final FileSystem fs, final FileStatus[] statuses, final Configuration conf) throws IOException {
boolean result = true;
if (statuses == null || statuses.length == 0) {
return false;
}
final List<Future<Boolean>> futures = new LinkedList<>();
final ExecutorService pool = conf.getInt(ConfVars.HIVE_MOVE_FILES_THREAD_COUNT.varname, 25) > 0 ? Executors.newFixedThreadPool(conf.getInt(ConfVars.HIVE_MOVE_FILES_THREAD_COUNT.varname, 25), new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Delete-Thread-%d").build()) : null;
final SessionState parentSession = SessionState.get();
for (final FileStatus status : statuses) {
if (null == pool) {
result &= FileUtils.moveToTrash(fs, status.getPath(), conf);
} else {
futures.add(pool.submit(new Callable<Boolean>() {
@Override
public Boolean call() throws Exception {
SessionState.setCurrentSessionState(parentSession);
return FileUtils.moveToTrash(fs, status.getPath(), conf);
}
}));
}
}
if (null != pool) {
pool.shutdown();
for (Future<Boolean> future : futures) {
try {
result &= future.get();
} catch (InterruptedException | ExecutionException e) {
LOG.error("Failed to delete: ", e);
pool.shutdownNow();
throw new IOException(e);
}
}
}
return result;
}
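
trashFiles folds every Future's Boolean into a single success flag and, on the first failure, calls shutdownNow to cancel outstanding work before rethrowing as IOException. Here is a compact sketch of that collect-and-abort pattern; the class name and file names are made up, and a trivial predicate stands in for FileUtils.moveToTrash.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class TrashFilesSketch {
    public static void main(String[] args) throws IOException {
        List<String> files = List.of("part-0000", "part-0001", "part-0002");
        ExecutorService pool = Executors.newFixedThreadPool(4);
        List<Future<Boolean>> futures = new ArrayList<>();
        for (String file : files) {
            // trivial predicate standing in for FileUtils.moveToTrash(fs, status.getPath(), conf)
            futures.add(pool.submit(() -> !file.isEmpty()));
        }
        pool.shutdown();
        boolean result = true;
        for (Future<Boolean> future : futures) {
            try {
                result &= future.get(); // a single false makes the overall result false
            } catch (InterruptedException | ExecutionException e) {
                pool.shutdownNow(); // cancel whatever has not started yet
                throw new IOException(e);
            }
        }
        System.out.println("all trashed: " + result);
    }
}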