Use of org.apache.hadoop.fs.PathFilter in project druid by druid-io.
The class HdfsFileTimestampVersionFinder, method mostRecentInDir:
private URI mostRecentInDir(final Path dir, final Pattern pattern) throws IOException {
final PathFilter filter = new PathFilter() {
@Override
public boolean accept(Path path) {
return pattern == null || pattern.matcher(path.getName()).matches();
}
};
long modifiedTime = Long.MIN_VALUE;
URI mostRecentURI = null;
final FileSystem fs = dir.getFileSystem(config);
for (FileStatus status : fs.listStatus(dir, filter)) {
if (status.isFile()) {
final long thisModifiedTime = status.getModificationTime();
if (thisModifiedTime >= modifiedTime) {
modifiedTime = thisModifiedTime;
mostRecentURI = status.getPath().toUri();
}
}
}
return mostRecentURI;
}
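For comparison, here is a minimal sketch of the same timestamp scan written with a lambda PathFilter. The fs, dir, and pattern names are assumptions standing in for the field and arguments used above; this is an illustrative variant, not part of the Druid class.
// pattern == null means "accept everything", mirroring the anonymous class above
PathFilter filter = path -> pattern == null || pattern.matcher(path.getName()).matches();
URI mostRecent = null;
long newestTime = Long.MIN_VALUE;
for (FileStatus status : fs.listStatus(dir, filter)) {
    // keep the URI of the most recently modified regular file that passes the filter
    if (status.isFile() && status.getModificationTime() >= newestTime) {
        newestTime = status.getModificationTime();
        mostRecent = status.getPath().toUri();
    }
}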
Use of org.apache.hadoop.fs.PathFilter in project hbase by apache.
The class FSUtils, method getRegionLocalityMappingFromFS:
/**
* This function scans the root path of the file system to get either the
* mapping between a region name and its best-locality region server, or the
* degree of locality of each region on each of the servers holding at least
* one block of that region. The output map parameters are both optional.
*
* @param conf
* the configuration to use
* @param desiredTable
* the table you wish to scan locality for
* @param threadPoolSize
* the thread pool size to use
* @param regionToBestLocalityRSMapping
* the map into which to put the best locality mapping or null
* @param regionDegreeLocalityMapping
* the map into which to put the locality degree mapping or null,
* must be a thread-safe implementation
* @throws IOException
* in case of file system errors or interrupts
*/
private static void getRegionLocalityMappingFromFS(final Configuration conf, final String desiredTable, int threadPoolSize, Map<String, String> regionToBestLocalityRSMapping, Map<String, Map<String, Float>> regionDegreeLocalityMapping) throws IOException {
FileSystem fs = FileSystem.get(conf);
Path rootPath = FSUtils.getRootDir(conf);
long startTime = EnvironmentEdgeManager.currentTime();
Path queryPath;
// The table files are in ${hbase.rootdir}/data/<namespace>/<table>/*
if (null == desiredTable) {
queryPath = new Path(new Path(rootPath, HConstants.BASE_NAMESPACE_DIR).toString() + "/*/*/*/");
} else {
queryPath = new Path(FSUtils.getTableDir(rootPath, TableName.valueOf(desiredTable)).toString() + "/*/");
}
// reject all paths that are not appropriate
PathFilter pathFilter = new PathFilter() {
@Override
public boolean accept(Path path) {
// the final path component should be an encoded region name; reject noise entries
if (null == path) {
return false;
}
// no parent?
Path parent = path.getParent();
if (null == parent) {
return false;
}
String regionName = path.getName();
if (null == regionName) {
return false;
}
if (!regionName.toLowerCase(Locale.ROOT).matches("[0-9a-f]+")) {
return false;
}
return true;
}
};
FileStatus[] statusList = fs.globStatus(queryPath, pathFilter);
if (null == statusList) {
return;
} else {
LOG.debug("Query Path: " + queryPath + " ; # list of files: " + statusList.length);
}
// lower the number of threads in case we have very few expected regions
threadPoolSize = Math.min(threadPoolSize, statusList.length);
// run in multiple threads
ThreadPoolExecutor tpe = new ThreadPoolExecutor(threadPoolSize, threadPoolSize, 60, TimeUnit.SECONDS, new ArrayBlockingQueue<>(statusList.length));
try {
// ignore all file status items that are not of interest
for (FileStatus regionStatus : statusList) {
if (null == regionStatus) {
continue;
}
if (!regionStatus.isDirectory()) {
continue;
}
Path regionPath = regionStatus.getPath();
if (null == regionPath) {
continue;
}
tpe.execute(new FSRegionScanner(fs, regionPath, regionToBestLocalityRSMapping, regionDegreeLocalityMapping));
}
} finally {
tpe.shutdown();
int threadWakeFrequency = conf.getInt(HConstants.THREAD_WAKE_FREQUENCY, 60 * 1000);
try {
// wait until the pool terminates, either naturally or due to exceptions in the worker threads
while (!tpe.awaitTermination(threadWakeFrequency, TimeUnit.MILLISECONDS)) {
// printing out rough estimate, so as to not introduce
// AtomicInteger
LOG.info("Locality checking is underway: { Scanned Regions : " + tpe.getCompletedTaskCount() + "/" + tpe.getTaskCount() + " }");
}
} catch (InterruptedException e) {
throw (InterruptedIOException) new InterruptedIOException().initCause(e);
}
}
long overhead = EnvironmentEdgeManager.currentTime() - startTime;
String overheadMsg = "Scan DFS for locality info takes " + overhead + " ms";
LOG.info(overheadMsg);
}
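The region-directory check above can also be expressed as a standalone filter and handed to globStatus. This is only an illustrative sketch that reuses the fs and queryPath values built in the method; the regionDirFilter name is an assumption.
// accept only paths that have a parent and whose final component looks like an
// encoded region name (a lowercase hexadecimal string)
PathFilter regionDirFilter = path ->
    path != null
        && path.getParent() != null
        && path.getName().toLowerCase(Locale.ROOT).matches("[0-9a-f]+");
FileStatus[] regionDirs = fs.globStatus(queryPath, regionDirFilter);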
Use of org.apache.hadoop.fs.PathFilter in project hbase by apache.
The class WALSplitter, method writeRegionSequenceIdFile:
/**
 * Create a file under the region's recovered-edits directory whose name encodes the
 * region's open sequence id.
 * @param fs the file system to write to
 * @param regiondir the directory of the region
 * @param newSeqId the proposed new sequence id; raised to the current maximum if smaller
 * @param saftyBumper the safety margin added to the sequence id before it is written
 * @return the new sequence id value that was written
 * @throws IOException in case of file system errors
 */
public static long writeRegionSequenceIdFile(final FileSystem fs, final Path regiondir, long newSeqId, long saftyBumper) throws IOException {
Path editsdir = WALSplitter.getRegionDirRecoveredEditsDir(regiondir);
long maxSeqId = 0;
FileStatus[] files = null;
if (fs.exists(editsdir)) {
files = FSUtils.listStatus(fs, editsdir, new PathFilter() {
@Override
public boolean accept(Path p) {
return isSequenceIdFile(p);
}
});
if (files != null) {
for (FileStatus status : files) {
String fileName = status.getPath().getName();
try {
Long tmpSeqId = Long.parseLong(fileName.substring(0, fileName.length() - SEQUENCE_ID_FILE_SUFFIX_LENGTH));
maxSeqId = Math.max(tmpSeqId, maxSeqId);
} catch (NumberFormatException ex) {
LOG.warn("Invalid SeqId File Name=" + fileName);
}
}
}
}
if (maxSeqId > newSeqId) {
newSeqId = maxSeqId;
}
// bump up SeqId
newSeqId += saftyBumper;
// write a new seqId file
Path newSeqIdFile = new Path(editsdir, newSeqId + SEQUENCE_ID_FILE_SUFFIX);
if (newSeqId != maxSeqId) {
try {
if (!fs.createNewFile(newSeqIdFile) && !fs.exists(newSeqIdFile)) {
throw new IOException("Failed to create SeqId file:" + newSeqIdFile);
}
if (LOG.isDebugEnabled()) {
LOG.debug("Wrote region seqId=" + newSeqIdFile + " to file, newSeqId=" + newSeqId + ", maxSeqId=" + maxSeqId);
}
} catch (FileAlreadyExistsException ignored) {
// newer HDFS versions throw this exception; it is fine if newSeqIdFile already exists
}
}
// remove old ones
if (files != null) {
for (FileStatus status : files) {
if (newSeqIdFile.equals(status.getPath())) {
continue;
}
fs.delete(status.getPath(), false);
}
}
return newSeqId;
}
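A brief usage sketch, assuming an existing FileSystem fs and a region directory path regionDir; the literal sequence id and safety-bump values are illustrative only.
// persist an open sequence id of at least 42 for the region, bumped by a
// safety margin of 1000 before the marker file is created under recovered edits
long persistedSeqId = WALSplitter.writeRegionSequenceIdFile(fs, regionDir, 42L, 1000L);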
Use of org.apache.hadoop.fs.PathFilter in project hadoop by apache.
The class FileInputFormat, method listStatus:
/** List input directories.
* Subclasses may override to, e.g., select only files matching a regular
* expression.
*
* @param job the job to list input paths for
* @return array of FileStatus objects
* @throws IOException if no input paths are specified for the job
*/
protected FileStatus[] listStatus(JobConf job) throws IOException {
Path[] dirs = getInputPaths(job);
if (dirs.length == 0) {
throw new IOException("No input paths specified in job");
}
// get tokens for all the required FileSystems
TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job);
// Whether we need to look recursively into the directory structure
boolean recursive = job.getBoolean(INPUT_DIR_RECURSIVE, false);
// creates a MultiPathFilter with the hiddenFileFilter and the
// user provided one (if any).
List<PathFilter> filters = new ArrayList<PathFilter>();
filters.add(hiddenFileFilter);
PathFilter jobFilter = getInputPathFilter(job);
if (jobFilter != null) {
filters.add(jobFilter);
}
PathFilter inputFilter = new MultiPathFilter(filters);
FileStatus[] result;
int numThreads = job.getInt(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.LIST_STATUS_NUM_THREADS, org.apache.hadoop.mapreduce.lib.input.FileInputFormat.DEFAULT_LIST_STATUS_NUM_THREADS);
StopWatch sw = new StopWatch().start();
if (numThreads == 1) {
List<FileStatus> locatedFiles = singleThreadedListStatus(job, dirs, inputFilter, recursive);
result = locatedFiles.toArray(new FileStatus[locatedFiles.size()]);
} else {
Iterable<FileStatus> locatedFiles = null;
try {
LocatedFileStatusFetcher locatedFileStatusFetcher = new LocatedFileStatusFetcher(job, dirs, recursive, inputFilter, false);
locatedFiles = locatedFileStatusFetcher.getFileStatuses();
} catch (InterruptedException e) {
throw new IOException("Interrupted while getting file statuses");
}
result = Iterables.toArray(locatedFiles, FileStatus.class);
}
sw.stop();
if (LOG.isDebugEnabled()) {
LOG.debug("Time taken to get FileStatuses: " + sw.now(TimeUnit.MILLISECONDS));
}
LOG.info("Total input files to process : " + result.length);
return result;
}
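A minimal sketch of how a job could add its own filter to this listing. The KeepCsvFilter class is hypothetical; it is registered through setInputPathFilter so that getInputPathFilter(job) above picks it up alongside hiddenFileFilter.
// hypothetical filter that keeps only .csv inputs
public static class KeepCsvFilter implements PathFilter {
    @Override
    public boolean accept(Path path) {
        return path.getName().endsWith(".csv");
    }
}

// registration on the JobConf, assuming an existing `job` instance
FileInputFormat.setInputPathFilter(job, KeepCsvFilter.class);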
Use of org.apache.hadoop.fs.PathFilter in project hadoop by apache.
The class CombineFileInputFormat, method createPool:
/**
* Create a new pool and add the filters to it.
* A pathname can satisfy any one of the specified filters.
* A split cannot have files from different pools.
*/
protected void createPool(PathFilter... filters) {
MultiPathFilter multi = new MultiPathFilter();
for (PathFilter f : filters) {
multi.add(f);
}
pools.add(multi);
}
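A hedged sketch of how a subclass might call createPool so that a combined split never mixes two file families. It assumes the org.apache.hadoop.mapreduce.lib.input variant of CombineFileInputFormat; the PooledInputFormat name and the .log/.idx suffixes are assumptions, and the record reader is deliberately left unimplemented.
public class PooledInputFormat extends CombineFileInputFormat<LongWritable, Text> {

    public PooledInputFormat() {
        // one pool per file family: a split contains only .log files or only .idx files
        createPool(path -> path.getName().endsWith(".log"));
        createPool(path -> path.getName().endsWith(".idx"));
    }

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
        // reader wiring is omitted from this sketch
        throw new UnsupportedOperationException("record reader not implemented in this sketch");
    }
}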