Use of org.apache.hadoop.fs.PathFilter in project druid by druid-io.
The class HdfsFileTimestampVersionFinder, method mostRecentInDir.
private URI mostRecentInDir(final Path dir, final Pattern pattern) throws IOException {
  // Accept every entry when no pattern is given; otherwise match on the file name.
  final PathFilter filter = new PathFilter() {
    @Override
    public boolean accept(Path path) {
      return pattern == null || pattern.matcher(path.getName()).matches();
    }
  };
  long modifiedTime = Long.MIN_VALUE;
  URI mostRecentURI = null;
  final FileSystem fs = dir.getFileSystem(config);
  for (FileStatus status : fs.listStatus(dir, filter)) {
    if (status.isFile()) {
      final long thisModifiedTime = status.getModificationTime();
      if (thisModifiedTime >= modifiedTime) {
        modifiedTime = thisModifiedTime;
        mostRecentURI = status.getPath().toUri();
      }
    }
  }
  // Null if the directory contains no matching files.
  return mostRecentURI;
}
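Since PathFilter declares a single accept(Path) method, the same filter can typically be written as a lambda on Java 8+. Below is a minimal sketch, not the project's code: the directory and pattern are hypothetical placeholders, and the usual org.apache.hadoop.fs and java.util.regex imports are assumed. It lists a directory with a PathFilter and keeps the most recently modified match, mirroring the method above.

// Sketch (hypothetical paths and pattern): pick the newest file that matches.
Configuration conf = new Configuration();
Path dir = new Path("hdfs:///tmp/segments");       // hypothetical directory
Pattern pattern = Pattern.compile(".*\\.zip");     // hypothetical pattern
FileSystem fs = dir.getFileSystem(conf);
FileStatus newest = null;
for (FileStatus status : fs.listStatus(dir, path -> pattern.matcher(path.getName()).matches())) {
  if (status.isFile()
      && (newest == null || status.getModificationTime() > newest.getModificationTime())) {
    newest = status;
  }
}
URI result = (newest == null) ? null : newest.getPath().toUri();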
Use of org.apache.hadoop.fs.PathFilter in project Cloud9 by lintool.
The class TrecWebDocnoMappingBuilder, method run.
@Override
public int run(String[] args) throws IOException {
  DocnoMapping.DefaultBuilderOptions options = DocnoMapping.BuilderUtils.parseDefaultOptions(args);
  if (options == null) {
    return -1;
  }
  // Temp directory.
  String tmpDir = "tmp-" + TrecWebDocnoMappingBuilder.class.getSimpleName() + "-" + random.nextInt(10000);
  LOG.info("Tool name: " + TrecWebDocnoMappingBuilder.class.getCanonicalName());
  LOG.info(" - input path: " + options.collection);
  LOG.info(" - output file: " + options.docnoMapping);
  Job job = new Job(getConf(), TrecWebDocnoMappingBuilder.class.getSimpleName() + ":" + options.collection);
  FileSystem fs = FileSystem.get(job.getConfiguration());
  job.setJarByClass(TrecWebDocnoMappingBuilder.class);
  job.setNumReduceTasks(1);
  // Skip bookkeeping entries whose names start with an underscore (e.g. _SUCCESS, _logs).
  PathFilter filter = new PathFilter() {
    @Override
    public boolean accept(Path path) {
      return !path.getName().startsWith("_");
    }
  };
  // Note: Gov2 and Wt10g raw collections are organized into sub-directories.
  Path collectionPath = new Path(options.collection);
  for (FileStatus status : fs.listStatus(collectionPath, filter)) {
    if (status.isDirectory()) {
      for (FileStatus s : fs.listStatus(status.getPath(), filter)) {
        FileInputFormat.addInputPath(job, s.getPath());
      }
    } else {
      FileInputFormat.addInputPath(job, status.getPath());
    }
  }
  FileOutputFormat.setOutputPath(job, new Path(tmpDir));
  FileOutputFormat.setCompressOutput(job, false);
  job.setInputFormatClass(options.inputFormat);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setMapperClass(MyMapper.class);
  job.setReducerClass(MyReducer.class);
  // Delete the output directory if it exists already.
  fs.delete(new Path(tmpDir), true);
  try {
    job.waitForCompletion(true);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
  writeMappingData(new Path(tmpDir + "/part-r-00000"), new Path(options.docnoMapping), fs);
  fs.delete(new Path(tmpDir), true);
  return 0;
}
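As an alternative to filtering manually while adding input paths, a similar filter could be registered on the job through the new-API FileInputFormat.setInputPathFilter, so the framework applies it when it lists the input directories. This is a sketch, not the project's code: the class name is illustrative, and the filter class typically needs a public no-arg constructor because it is instantiated by reflection.

// Illustrative filter class: skip entries whose names start with an underscore.
public static class SkipUnderscoreFilter implements PathFilter {
  @Override
  public boolean accept(Path path) {
    return !path.getName().startsWith("_");
  }
}

// Inside run(), after the input paths have been added (sketch, assuming
// org.apache.hadoop.mapreduce.lib.input.FileInputFormat):
FileInputFormat.setInputPathFilter(job, SkipUnderscoreFilter.class);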
Use of org.apache.hadoop.fs.PathFilter in project hadoop by apache.
The class ErasureCodeBenchmarkThroughput, method cleanUp.
private void cleanUp(int dataSizeMB, boolean isEc) throws IOException {
  final String fileName = getFilePath(dataSizeMB, isEc);
  Path path = isEc ? new Path(EC_DIR) : new Path(REP_DIR);
  // Match any path whose string form contains the benchmark file name.
  FileStatus[] fileStatuses = fs.listStatus(path, new PathFilter() {
    @Override
    public boolean accept(Path path) {
      return path.toString().contains(fileName);
    }
  });
  for (FileStatus fileStatus : fileStatuses) {
    // Non-recursive delete: only the matching entries themselves are removed.
    fs.delete(fileStatus.getPath(), false);
  }
}
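The same listStatus-then-delete pattern can be factored into a small helper. The sketch below is hypothetical: the method name is illustrative and the lambda form assumes Java 8+.

// Hypothetical helper: delete every entry under 'dir' whose path contains 'token'.
// Uses a non-recursive delete, matching the cleanup above.
private static void deleteMatching(FileSystem fs, Path dir, String token) throws IOException {
  for (FileStatus status : fs.listStatus(dir, path -> path.toString().contains(token))) {
    fs.delete(status.getPath(), false);
  }
}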
Use of org.apache.hadoop.fs.PathFilter in project hadoop by apache.
The class FileInputFormat, method listStatus.
/**
 * List input directories.
 * Subclasses may override to, e.g., select only files matching a regular
 * expression.
 *
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected FileStatus[] listStatus(JobConf job) throws IOException {
  Path[] dirs = getInputPaths(job);
  if (dirs.length == 0) {
    throw new IOException("No input paths specified in job");
  }
  // Get tokens for all the required FileSystems.
  TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job);
  // Whether we need to look recursively into the directory structure.
  boolean recursive = job.getBoolean(INPUT_DIR_RECURSIVE, false);
  // Create a MultiPathFilter from the hiddenFileFilter and the
  // user-provided filter (if any).
  List<PathFilter> filters = new ArrayList<PathFilter>();
  filters.add(hiddenFileFilter);
  PathFilter jobFilter = getInputPathFilter(job);
  if (jobFilter != null) {
    filters.add(jobFilter);
  }
  PathFilter inputFilter = new MultiPathFilter(filters);
  FileStatus[] result;
  int numThreads = job.getInt(
      org.apache.hadoop.mapreduce.lib.input.FileInputFormat.LIST_STATUS_NUM_THREADS,
      org.apache.hadoop.mapreduce.lib.input.FileInputFormat.DEFAULT_LIST_STATUS_NUM_THREADS);
  StopWatch sw = new StopWatch().start();
  if (numThreads == 1) {
    // Single-threaded listing.
    List<FileStatus> locatedFiles = singleThreadedListStatus(job, dirs, inputFilter, recursive);
    result = locatedFiles.toArray(new FileStatus[locatedFiles.size()]);
  } else {
    // Multi-threaded listing via LocatedFileStatusFetcher.
    Iterable<FileStatus> locatedFiles = null;
    try {
      LocatedFileStatusFetcher locatedFileStatusFetcher =
          new LocatedFileStatusFetcher(job, dirs, recursive, inputFilter, false);
      locatedFiles = locatedFileStatusFetcher.getFileStatuses();
    } catch (InterruptedException e) {
      throw new IOException("Interrupted while getting file statuses");
    }
    result = Iterables.toArray(locatedFiles, FileStatus.class);
  }
  sw.stop();
  if (LOG.isDebugEnabled()) {
    LOG.debug("Time taken to get FileStatuses: " + sw.now(TimeUnit.MILLISECONDS));
  }
  LOG.info("Total input files to process : " + result.length);
  return result;
}
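The javadoc above notes that subclasses may override listStatus to select only files matching a regular expression. A minimal, hypothetical sketch of such an override for the old (org.apache.hadoop.mapred) API could look like the following; the class name and pattern are illustrative, and it extends the old-API TextInputFormat to inherit the record reader.

// Hypothetical subclass that keeps only inputs whose names match a regular expression.
public class RegexTextInputFormat extends TextInputFormat {
  private static final Pattern NAME_PATTERN = Pattern.compile(".*\\.gz");  // illustrative pattern

  @Override
  protected FileStatus[] listStatus(JobConf job) throws IOException {
    List<FileStatus> kept = new ArrayList<>();
    for (FileStatus status : super.listStatus(job)) {
      if (NAME_PATTERN.matcher(status.getPath().getName()).matches()) {
        kept.add(status);
      }
    }
    return kept.toArray(new FileStatus[0]);
  }
}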
Use of org.apache.hadoop.fs.PathFilter in project hadoop by apache.
The class CombineFileInputFormat, method createPool.
/**
 * Create a new pool and add the filters to it.
 * A pathname can satisfy any one of the specified filters.
 * A split cannot have files from different pools.
 */
protected void createPool(PathFilter... filters) {
  MultiPathFilter multi = new MultiPathFilter();
  for (PathFilter f : filters) {
    multi.add(f);
  }
  pools.add(multi);
}
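Because createPool is protected, it is normally called from a subclass, typically in the constructor before splits are computed. The sketch below is hypothetical: the class name and file extensions are illustrative, it assumes the new-API CombineTextInputFormat as the base class, and the lambdas assume Java 8+. A path only has to satisfy one of the filters passed to a single createPool call, and files from different pools are never combined into the same split.

// Hypothetical subclass: put .log and .txt inputs into one pool so that their
// splits never mix with files outside that pool.
public class PooledCombineTextInputFormat extends CombineTextInputFormat {
  public PooledCombineTextInputFormat() {
    createPool(
        path -> path.getName().endsWith(".log"),
        path -> path.getName().endsWith(".txt"));
  }
}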