Example 66 with PathFilter

Use of org.apache.hadoop.fs.PathFilter in project druid by druid-io.

Class HdfsFileTimestampVersionFinder, method mostRecentInDir:

private URI mostRecentInDir(final Path dir, final Pattern pattern) throws IOException {
    // Match file names against the pattern; a null pattern accepts everything.
    final PathFilter filter = new PathFilter() {

        @Override
        public boolean accept(Path path) {
            return pattern == null || pattern.matcher(path.getName()).matches();
        }
    };
    long modifiedTime = Long.MIN_VALUE;
    URI mostRecentURI = null;
    final FileSystem fs = dir.getFileSystem(config);
    // Track the most recently modified regular file under dir.
    for (FileStatus status : fs.listStatus(dir, filter)) {
        if (status.isFile()) {
            final long thisModifiedTime = status.getModificationTime();
            if (thisModifiedTime >= modifiedTime) {
                modifiedTime = thisModifiedTime;
                mostRecentURI = status.getPath().toUri();
            }
        }
    }
    return mostRecentURI;
}
Also used: Path (org.apache.hadoop.fs.Path), PathFilter (org.apache.hadoop.fs.PathFilter), FileStatus (org.apache.hadoop.fs.FileStatus), FileSystem (org.apache.hadoop.fs.FileSystem), URI (java.net.URI)
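
Since PathFilter declares a single accept(Path) method, it is a functional interface, so on Java 8+ the anonymous class above collapses to a lambda; a minimal, behavior-preserving sketch (pattern is the surrounding method's parameter):

// Java 8+ equivalent of the anonymous PathFilter above.
final PathFilter filter = path -> pattern == null || pattern.matcher(path.getName()).matches();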

Example 67 with PathFilter

Use of org.apache.hadoop.fs.PathFilter in project hbase by apache.

Class FSUtils, method getRegionLocalityMappingFromFS:

/**
   * Scans the root path of the file system to compute either the mapping from
   * each region name to its best-locality region server, or the degree of
   * locality of each region on every server that holds at least one block of
   * that region. The two output map parameters are both optional.
   *
   * @param conf
   *          the configuration to use
   * @param desiredTable
   *          the table you wish to scan locality for
   * @param threadPoolSize
   *          the thread pool size to use
   * @param regionToBestLocalityRSMapping
   *          the map into which to put the best locality mapping or null
   * @param regionDegreeLocalityMapping
   *          the map into which to put the locality degree mapping or null,
   *          must be a thread-safe implementation
   * @throws IOException
   *           in case of file system errors or interrupts
   */
private static void getRegionLocalityMappingFromFS(final Configuration conf, final String desiredTable, int threadPoolSize, Map<String, String> regionToBestLocalityRSMapping, Map<String, Map<String, Float>> regionDegreeLocalityMapping) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    Path rootPath = FSUtils.getRootDir(conf);
    long startTime = EnvironmentEdgeManager.currentTime();
    Path queryPath;
    // The table files are in ${hbase.rootdir}/data/<namespace>/<table>/*
    if (null == desiredTable) {
        queryPath = new Path(new Path(rootPath, HConstants.BASE_NAMESPACE_DIR).toString() + "/*/*/*/");
    } else {
        queryPath = new Path(FSUtils.getTableDir(rootPath, TableName.valueOf(desiredTable)).toString() + "/*/");
    }
    // reject all paths that are not appropriate
    PathFilter pathFilter = new PathFilter() {

        @Override
        public boolean accept(Path path) {
            // The last path component should be an encoded region name, but the
            // glob may also match noise entries, so validate defensively.
            if (null == path) {
                return false;
            }
            // A region directory always has a parent (the table directory).
            Path parent = path.getParent();
            if (null == parent) {
                return false;
            }
            String regionName = path.getName();
            if (null == regionName) {
                return false;
            }
            // Encoded region names are hexadecimal strings.
            if (!regionName.toLowerCase(Locale.ROOT).matches("[0-9a-f]+")) {
                return false;
            }
            return true;
        }
    };
    FileStatus[] statusList = fs.globStatus(queryPath, pathFilter);
    if (null == statusList) {
        return;
    }
    LOG.debug("Query Path: " + queryPath + " ; # list of files: " + statusList.length);
    if (statusList.length == 0) {
        // Nothing matched; bail out rather than build an executor with a
        // zero-capacity work queue, which would throw IllegalArgumentException.
        return;
    }
    // lower the number of threads in case we have very few expected regions
    threadPoolSize = Math.min(threadPoolSize, statusList.length);
    // run in multiple threads
    ThreadPoolExecutor tpe = new ThreadPoolExecutor(threadPoolSize, threadPoolSize, 60, TimeUnit.SECONDS, new ArrayBlockingQueue<>(statusList.length));
    try {
        // ignore all file status items that are not of interest
        for (FileStatus regionStatus : statusList) {
            if (null == regionStatus) {
                continue;
            }
            if (!regionStatus.isDirectory()) {
                continue;
            }
            Path regionPath = regionStatus.getPath();
            if (null == regionPath) {
                continue;
            }
            tpe.execute(new FSRegionScanner(fs, regionPath, regionToBestLocalityRSMapping, regionDegreeLocalityMapping));
        }
    } finally {
        tpe.shutdown();
        int threadWakeFrequency = conf.getInt(HConstants.THREAD_WAKE_FREQUENCY, 60 * 1000);
        try {
            // Wait until the executor terminates, either naturally or because of
            // exceptions in the worker threads.
            while (!tpe.awaitTermination(threadWakeFrequency, TimeUnit.MILLISECONDS)) {
                // printing out rough estimate, so as to not introduce
                // AtomicInteger
                LOG.info("Locality checking is underway: { Scanned Regions : " + tpe.getCompletedTaskCount() + "/" + tpe.getTaskCount() + " }");
            }
        } catch (InterruptedException e) {
            throw (InterruptedIOException) new InterruptedIOException().initCause(e);
        }
    }
    long overhead = EnvironmentEdgeManager.currentTime() - startTime;
    String overheadMsg = "Scanning DFS for locality info took " + overhead + " ms";
    LOG.info(overheadMsg);
}
Also used: Path (org.apache.hadoop.fs.Path), InterruptedIOException (java.io.InterruptedIOException), PathFilter (org.apache.hadoop.fs.PathFilter), FileStatus (org.apache.hadoop.fs.FileStatus), LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus), FileSystem (org.apache.hadoop.fs.FileSystem), HFileSystem (org.apache.hadoop.hbase.fs.HFileSystem), DistributedFileSystem (org.apache.hadoop.hdfs.DistributedFileSystem), ThreadPoolExecutor (java.util.concurrent.ThreadPoolExecutor)
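
A hypothetical caller sketch for the method above (variable names are illustrative, not taken from HBase). The javadoc requires the degree-of-locality map to be thread-safe, so a ConcurrentHashMap fits, and passing null for desiredTable scans all tables:

// Hypothetical invocation; conf is an existing Hadoop Configuration.
Map<String, Map<String, Float>> regionDegreeLocalityMapping = new ConcurrentHashMap<>();
getRegionLocalityMappingFromFS(conf, null, 16, null, regionDegreeLocalityMapping);
// Per the javadoc, the map now holds, for each region, the degree of locality
// on each server that has at least one block of that region.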

Example 68 with PathFilter

Use of org.apache.hadoop.fs.PathFilter in project hbase by apache.

Class WALSplitter, method writeRegionSequenceIdFile:

/**
   * Create a file whose name encodes the region's open sequence id.
   * @param fs the filesystem to create the file on
   * @param regiondir the region directory whose recovered-edits directory receives the file
   * @param newSeqId the proposed new sequence id
   * @param saftyBumper safety margin added on top of the highest sequence id found on disk
   * @return the new sequence id that was written
   * @throws IOException if the sequence id file cannot be created
   */
public static long writeRegionSequenceIdFile(final FileSystem fs, final Path regiondir, long newSeqId, long saftyBumper) throws IOException {
    Path editsdir = WALSplitter.getRegionDirRecoveredEditsDir(regiondir);
    long maxSeqId = 0;
    FileStatus[] files = null;
    if (fs.exists(editsdir)) {
        files = FSUtils.listStatus(fs, editsdir, new PathFilter() {

            @Override
            public boolean accept(Path p) {
                return isSequenceIdFile(p);
            }
        });
        if (files != null) {
            for (FileStatus status : files) {
                String fileName = status.getPath().getName();
                try {
                    Long tmpSeqId = Long.parseLong(fileName.substring(0, fileName.length() - SEQUENCE_ID_FILE_SUFFIX_LENGTH));
                    maxSeqId = Math.max(tmpSeqId, maxSeqId);
                } catch (NumberFormatException ex) {
                    LOG.warn("Invalid SeqId File Name=" + fileName);
                }
            }
        }
    }
    if (maxSeqId > newSeqId) {
        newSeqId = maxSeqId;
    }
    // bump up SeqId
    newSeqId += saftyBumper;
    // write a new seqId file
    Path newSeqIdFile = new Path(editsdir, newSeqId + SEQUENCE_ID_FILE_SUFFIX);
    if (newSeqId != maxSeqId) {
        try {
            if (!fs.createNewFile(newSeqIdFile) && !fs.exists(newSeqIdFile)) {
                throw new IOException("Failed to create SeqId file:" + newSeqIdFile);
            }
            if (LOG.isDebugEnabled()) {
                LOG.debug("Wrote region seqId=" + newSeqIdFile + " to file, newSeqId=" + newSeqId + ", maxSeqId=" + maxSeqId);
            }
        } catch (FileAlreadyExistsException ignored) {
        // Recent HDFS versions throw this exception; it is fine if newSeqIdFile already exists.
        }
    }
    // remove old ones
    if (files != null) {
        for (FileStatus status : files) {
            if (newSeqIdFile.equals(status.getPath())) {
                continue;
            }
            fs.delete(status.getPath(), false);
        }
    }
    return newSeqId;
}
Also used: Path (org.apache.hadoop.fs.Path), PathFilter (org.apache.hadoop.fs.PathFilter), FileAlreadyExistsException (org.apache.hadoop.fs.FileAlreadyExistsException), FileStatus (org.apache.hadoop.fs.FileStatus), AtomicLong (java.util.concurrent.atomic.AtomicLong), InterruptedIOException (java.io.InterruptedIOException), IOException (java.io.IOException), MultipleIOException (org.apache.hadoop.io.MultipleIOException)
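
The isSequenceIdFile helper used by the PathFilter above is not shown. A minimal sketch consistent with how this method builds the file names, assuming a match on SEQUENCE_ID_FILE_SUFFIX alone (the real HBase helper may also recognize a legacy suffix):

// Sketch: a sequence-id file is identified purely by its name suffix.
private static boolean isSequenceIdFile(final Path file) {
    return file.getName().endsWith(SEQUENCE_ID_FILE_SUFFIX);
}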

Example 69 with PathFilter

Use of org.apache.hadoop.fs.PathFilter in project hadoop by apache.

Class FileInputFormat, method listStatus:

/** List input directories.
   * Subclasses may override to, e.g., select only files matching a regular
   * expression.
   *
   * @param job the job to list input paths for
   * @return array of FileStatus objects
   * @throws IOException if no input paths have been specified for the job
   */
protected FileStatus[] listStatus(JobConf job) throws IOException {
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }
    // Get delegation tokens for all the required FileSystems.
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job);
    // Whether we need to look recursively into the directory structure
    boolean recursive = job.getBoolean(INPUT_DIR_RECURSIVE, false);
    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);
    FileStatus[] result;
    int numThreads = job.getInt(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.LIST_STATUS_NUM_THREADS, org.apache.hadoop.mapreduce.lib.input.FileInputFormat.DEFAULT_LIST_STATUS_NUM_THREADS);
    StopWatch sw = new StopWatch().start();
    if (numThreads == 1) {
        List<FileStatus> locatedFiles = singleThreadedListStatus(job, dirs, inputFilter, recursive);
        result = locatedFiles.toArray(new FileStatus[locatedFiles.size()]);
    } else {
        Iterable<FileStatus> locatedFiles = null;
        try {
            LocatedFileStatusFetcher locatedFileStatusFetcher = new LocatedFileStatusFetcher(job, dirs, recursive, inputFilter, false);
            locatedFiles = locatedFileStatusFetcher.getFileStatuses();
        } catch (InterruptedException e) {
            // Preserve the interruption as the cause for easier debugging.
            throw new IOException("Interrupted while getting file statuses", e);
        }
        result = Iterables.toArray(locatedFiles, FileStatus.class);
    }
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Time taken to get FileStatuses: " + sw.now(TimeUnit.MILLISECONDS));
    }
    LOG.info("Total input files to process : " + result.length);
    return result;
}
Also used: Path (org.apache.hadoop.fs.Path), PathFilter (org.apache.hadoop.fs.PathFilter), FileStatus (org.apache.hadoop.fs.FileStatus), LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus), ArrayList (java.util.ArrayList), IOException (java.io.IOException), StopWatch (org.apache.hadoop.util.StopWatch)
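
The hiddenFileFilter referenced above is FileInputFormat's built-in default filter; a sketch matching Hadoop's convention of skipping names that start with "_" (for example _SUCCESS) or ".":

// Rejects dot-files and MapReduce bookkeeping output such as _SUCCESS.
private static final PathFilter hiddenFileFilter = new PathFilter() {
    @Override
    public boolean accept(Path p) {
        String name = p.getName();
        return !name.startsWith("_") && !name.startsWith(".");
    }
};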

Example 70 with PathFilter

Use of org.apache.hadoop.fs.PathFilter in project hadoop by apache.

Class CombineFileInputFormat, method createPool:

/**
   * Create a new pool and add the filters to it. 
   * A pathname can satisfy any one of the specified filters.
   * A split cannot have files from different pools.
   */
protected void createPool(PathFilter... filters) {
    MultiPathFilter multi = new MultiPathFilter();
    for (PathFilter f : filters) {
        multi.add(f);
    }
    pools.add(multi);
}
Also used: PathFilter (org.apache.hadoop.fs.PathFilter)
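
The MultiPathFilter built here is package-private, but its documented "any one of the specified filters" semantics can be sketched as a standalone OR-combinator (the class name is illustrative):

import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Illustrative OR-combinator mirroring the pool semantics documented above:
// a path is accepted if any one of the underlying filters accepts it.
public class AnyOfPathFilter implements PathFilter {

    private final List<PathFilter> filters;

    public AnyOfPathFilter(PathFilter... filters) {
        this.filters = Arrays.asList(filters);
    }

    @Override
    public boolean accept(Path path) {
        for (PathFilter f : filters) {
            if (f.accept(path)) {
                return true;
            }
        }
        return false;
    }
}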

Aggregations

PathFilter (org.apache.hadoop.fs.PathFilter): 123
Path (org.apache.hadoop.fs.Path): 114
FileStatus (org.apache.hadoop.fs.FileStatus): 96
Test (org.junit.Test): 47
IOException (java.io.IOException): 42
FileSystem (org.apache.hadoop.fs.FileSystem): 39
ArrayList (java.util.ArrayList): 22
List (java.util.List): 19
Configuration (org.apache.hadoop.conf.Configuration): 18
Collections (java.util.Collections): 11
BufferedReader (java.io.BufferedReader): 9
InputStreamReader (java.io.InputStreamReader): 9
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 9
Assert.assertEquals (org.junit.Assert.assertEquals): 9
Assert.assertTrue (org.junit.Assert.assertTrue): 9
URI (java.net.URI): 8
Test (org.testng.annotations.Test): 8
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 7
IGNORED (com.facebook.presto.hive.NestedDirectoryPolicy.IGNORED): 6
RECURSE (com.facebook.presto.hive.NestedDirectoryPolicy.RECURSE): 6