Use of org.apache.hadoop.fs.PathFilter in project druid by druid-io.
The class HdfsFileTimestampVersionFinder, method mostRecentInDir.
private URI mostRecentInDir(final Path dir, final Pattern pattern) throws IOException {
  // Accept every entry when no pattern is given; otherwise match on the file name.
  final PathFilter filter = new PathFilter() {
    @Override
    public boolean accept(Path path) {
      return pattern == null || pattern.matcher(path.getName()).matches();
    }
  };
  long modifiedTime = Long.MIN_VALUE;
  URI mostRecentURI = null;
  final FileSystem fs = dir.getFileSystem(config);
  for (FileStatus status : fs.listStatus(dir, filter)) {
    if (status.isFile()) {
      final long thisModifiedTime = status.getModificationTime();
      if (thisModifiedTime >= modifiedTime) {
        modifiedTime = thisModifiedTime;
        mostRecentURI = status.getPath().toUri();
      }
    }
  }
  // Null if the directory contains no matching files.
  return mostRecentURI;
}
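Since PathFilter declares a single accept(Path) method, the same filter can typically be written as a lambda on Java 8+. Below is a minimal sketch, not the project's code: the directory and pattern are hypothetical placeholders, and the usual org.apache.hadoop.fs and java.util.regex imports are assumed. It lists a directory with a PathFilter and keeps the most recently modified match, mirroring the method above.

// Sketch (hypothetical paths and pattern): pick the newest file that matches.
Configuration conf = new Configuration();
Path dir = new Path("hdfs:///tmp/segments");       // hypothetical directory
Pattern pattern = Pattern.compile(".*\\.zip");     // hypothetical pattern
FileSystem fs = dir.getFileSystem(conf);
FileStatus newest = null;
for (FileStatus status : fs.listStatus(dir, path -> pattern.matcher(path.getName()).matches())) {
  if (status.isFile()
      && (newest == null || status.getModificationTime() > newest.getModificationTime())) {
    newest = status;
  }
}
URI result = (newest == null) ? null : newest.getPath().toUri();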
Use of org.apache.hadoop.fs.PathFilter in project Cloud9 by lintool.
The class TrecWebDocnoMappingBuilder, method run.
@Override
public int run(String[] args) throws IOException {
  DocnoMapping.DefaultBuilderOptions options = DocnoMapping.BuilderUtils.parseDefaultOptions(args);
  if (options == null) {
    return -1;
  }
  // Temp directory.
  String tmpDir = "tmp-" + TrecWebDocnoMappingBuilder.class.getSimpleName() + "-" + random.nextInt(10000);
  LOG.info("Tool name: " + TrecWebDocnoMappingBuilder.class.getCanonicalName());
  LOG.info(" - input path: " + options.collection);
  LOG.info(" - output file: " + options.docnoMapping);
  Job job = new Job(getConf(), TrecWebDocnoMappingBuilder.class.getSimpleName() + ":" + options.collection);
  FileSystem fs = FileSystem.get(job.getConfiguration());
  job.setJarByClass(TrecWebDocnoMappingBuilder.class);
  job.setNumReduceTasks(1);
  // Skip bookkeeping entries whose names start with an underscore (e.g. _SUCCESS, _logs).
  PathFilter filter = new PathFilter() {
    @Override
    public boolean accept(Path path) {
      return !path.getName().startsWith("_");
    }
  };
  // Note: Gov2 and Wt10g raw collections are organized into sub-directories.
  Path collectionPath = new Path(options.collection);
  for (FileStatus status : fs.listStatus(collectionPath, filter)) {
    if (status.isDirectory()) {
      for (FileStatus s : fs.listStatus(status.getPath(), filter)) {
        FileInputFormat.addInputPath(job, s.getPath());
      }
    } else {
      FileInputFormat.addInputPath(job, status.getPath());
    }
  }
  FileOutputFormat.setOutputPath(job, new Path(tmpDir));
  FileOutputFormat.setCompressOutput(job, false);
  job.setInputFormatClass(options.inputFormat);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setMapperClass(MyMapper.class);
  job.setReducerClass(MyReducer.class);
  // Delete the output directory if it exists already.
  fs.delete(new Path(tmpDir), true);
  try {
    job.waitForCompletion(true);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
  writeMappingData(new Path(tmpDir + "/part-r-00000"), new Path(options.docnoMapping), fs);
  fs.delete(new Path(tmpDir), true);
  return 0;
}
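As an alternative to filtering manually while adding input paths, a similar filter could be registered on the job through the new-API FileInputFormat.setInputPathFilter, so the framework applies it when it lists the input directories. This is a sketch, not the project's code: the class name is illustrative, and the filter class typically needs a public no-arg constructor because it is instantiated by reflection.

// Illustrative filter class: skip entries whose names start with an underscore.
public static class SkipUnderscoreFilter implements PathFilter {
  @Override
  public boolean accept(Path path) {
    return !path.getName().startsWith("_");
  }
}

// Inside run(), after the input paths have been added (sketch, assuming
// org.apache.hadoop.mapreduce.lib.input.FileInputFormat):
FileInputFormat.setInputPathFilter(job, SkipUnderscoreFilter.class);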
Use of org.apache.hadoop.fs.PathFilter in project hadoop by apache.
The class ErasureCodeBenchmarkThroughput, method cleanUp.
private void cleanUp(int dataSizeMB, boolean isEc) throws IOException {
  final String fileName = getFilePath(dataSizeMB, isEc);
  Path path = isEc ? new Path(EC_DIR) : new Path(REP_DIR);
  // Match any path whose string form contains the benchmark file name.
  FileStatus[] fileStatuses = fs.listStatus(path, new PathFilter() {
    @Override
    public boolean accept(Path path) {
      return path.toString().contains(fileName);
    }
  });
  for (FileStatus fileStatus : fileStatuses) {
    // Non-recursive delete: only the matching entries themselves are removed.
    fs.delete(fileStatus.getPath(), false);
  }
}
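The same listStatus-then-delete pattern can be factored into a small helper. The sketch below is hypothetical: the method name is illustrative and the lambda form assumes Java 8+.

// Hypothetical helper: delete every entry under 'dir' whose path contains 'token'.
// Uses a non-recursive delete, matching the cleanup above.
private static void deleteMatching(FileSystem fs, Path dir, String token) throws IOException {
  for (FileStatus status : fs.listStatus(dir, path -> path.toString().contains(token))) {
    fs.delete(status.getPath(), false);
  }
}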
Use of org.apache.hadoop.fs.PathFilter in project hadoop by apache.
The class FileInputFormat, method listStatus.
/**
 * List input directories.
 * Subclasses may override to, e.g., select only files matching a regular
 * expression.
 *
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected FileStatus[] listStatus(JobConf job) throws IOException {
  Path[] dirs = getInputPaths(job);
  if (dirs.length == 0) {
    throw new IOException("No input paths specified in job");
  }
  // Get tokens for all the required FileSystems.
  TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job);
  // Whether we need to look recursively into the directory structure.
  boolean recursive = job.getBoolean(INPUT_DIR_RECURSIVE, false);
  // Create a MultiPathFilter from the hiddenFileFilter and the
  // user-provided filter (if any).
  List<PathFilter> filters = new ArrayList<PathFilter>();
  filters.add(hiddenFileFilter);
  PathFilter jobFilter = getInputPathFilter(job);
  if (jobFilter != null) {
    filters.add(jobFilter);
  }
  PathFilter inputFilter = new MultiPathFilter(filters);
  FileStatus[] result;
  int numThreads = job.getInt(
      org.apache.hadoop.mapreduce.lib.input.FileInputFormat.LIST_STATUS_NUM_THREADS,
      org.apache.hadoop.mapreduce.lib.input.FileInputFormat.DEFAULT_LIST_STATUS_NUM_THREADS);
  StopWatch sw = new StopWatch().start();
  if (numThreads == 1) {
    // Single-threaded listing.
    List<FileStatus> locatedFiles = singleThreadedListStatus(job, dirs, inputFilter, recursive);
    result = locatedFiles.toArray(new FileStatus[locatedFiles.size()]);
  } else {
    // Multi-threaded listing via LocatedFileStatusFetcher.
    Iterable<FileStatus> locatedFiles = null;
    try {
      LocatedFileStatusFetcher locatedFileStatusFetcher =
          new LocatedFileStatusFetcher(job, dirs, recursive, inputFilter, false);
      locatedFiles = locatedFileStatusFetcher.getFileStatuses();
    } catch (InterruptedException e) {
      throw new IOException("Interrupted while getting file statuses");
    }
    result = Iterables.toArray(locatedFiles, FileStatus.class);
  }
  sw.stop();
  if (LOG.isDebugEnabled()) {
    LOG.debug("Time taken to get FileStatuses: " + sw.now(TimeUnit.MILLISECONDS));
  }
  LOG.info("Total input files to process : " + result.length);
  return result;
}
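The javadoc above notes that subclasses may override listStatus to select only files matching a regular expression. A minimal, hypothetical sketch of such an override for the old (org.apache.hadoop.mapred) API could look like the following; the class name and pattern are illustrative, and it extends the old-API TextInputFormat to inherit the record reader.

// Hypothetical subclass that keeps only inputs whose names match a regular expression.
public class RegexTextInputFormat extends TextInputFormat {
  private static final Pattern NAME_PATTERN = Pattern.compile(".*\\.gz");  // illustrative pattern

  @Override
  protected FileStatus[] listStatus(JobConf job) throws IOException {
    List<FileStatus> kept = new ArrayList<>();
    for (FileStatus status : super.listStatus(job)) {
      if (NAME_PATTERN.matcher(status.getPath().getName()).matches()) {
        kept.add(status);
      }
    }
    return kept.toArray(new FileStatus[0]);
  }
}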
Use of org.apache.hadoop.fs.PathFilter in project hadoop by apache.
The class CombineFileInputFormat, method createPool.
/**
 * Create a new pool and add the filters to it.
 * A pathname can satisfy any one of the specified filters.
 * A split cannot have files from different pools.
 */
protected void createPool(PathFilter... filters) {
  MultiPathFilter multi = new MultiPathFilter();
  for (PathFilter f : filters) {
    multi.add(f);
  }
  pools.add(multi);
}
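Because createPool is protected, it is normally called from a subclass, typically in the constructor before splits are computed. The sketch below is hypothetical: the class name and file extensions are illustrative, it assumes the new-API CombineTextInputFormat as the base class, and the lambdas assume Java 8+. A path only has to satisfy one of the filters passed to a single createPool call, and files from different pools are never combined into the same split.

// Hypothetical subclass: put .log and .txt inputs into one pool so that their
// splits never mix with files outside that pool.
public class PooledCombineTextInputFormat extends CombineTextInputFormat {
  public PooledCombineTextInputFormat() {
    createPool(
        path -> path.getName().endsWith(".log"),
        path -> path.getName().endsWith(".txt"));
  }
}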