use of org.apache.hadoop.fs.PathFilter in project hadoop by apache.
the class GenerateData method publishPlainDataStatistics.
static DataStatistics publishPlainDataStatistics(Configuration conf, Path inputDir) throws IOException {
  FileSystem fs = inputDir.getFileSystem(conf);
  // obtain input data file statuses
  long dataSize = 0;
  long fileCount = 0;
  RemoteIterator<LocatedFileStatus> iter = fs.listFiles(inputDir, true);
  PathFilter filter = new Utils.OutputFileUtils.OutputFilesFilter();
  while (iter.hasNext()) {
    LocatedFileStatus lStatus = iter.next();
    if (filter.accept(lStatus.getPath())) {
      dataSize += lStatus.getLen();
      ++fileCount;
    }
  }
  // publish the plain data statistics
  LOG.info("Total size of input data : " + StringUtils.humanReadableInt(dataSize));
  LOG.info("Total number of input data files : " + fileCount);
  return new DataStatistics(dataSize, fileCount, false);
}
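PathFilter itself is a single-method interface, so a filter in the spirit of OutputFilesFilter can also be written directly. A minimal sketch (the class name HiddenAndSideFileFilter is illustrative, not part of Hadoop):
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Illustrative stand-in for Utils.OutputFileUtils.OutputFilesFilter: reject
// hidden files and MapReduce side files such as _SUCCESS and _logs.
public class HiddenAndSideFileFilter implements PathFilter {
  @Override
  public boolean accept(Path path) {
    String name = path.getName();
    return !name.startsWith(".") && !name.startsWith("_");
  }
}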
use of org.apache.hadoop.fs.PathFilter in project hadoop by apache.
the class FileInputFormat method listStatus.
/** List input directories.
* Subclasses may override to, e.g., select only files matching a regular
* expression.
*
* @param job the job to list input paths for
* @return array of FileStatus objects
* @throws IOException if no input paths are specified.
*/
protected List<FileStatus> listStatus(JobContext job) throws IOException {
  Path[] dirs = getInputPaths(job);
  if (dirs.length == 0) {
    throw new IOException("No input paths specified in job");
  }
  // get tokens for all the required FileSystems..
  TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());
  // Whether we need to recursively look into the directory structure
  boolean recursive = getInputDirRecursive(job);
  // creates a MultiPathFilter with the hiddenFileFilter and the
  // user-provided one (if any).
  List<PathFilter> filters = new ArrayList<PathFilter>();
  filters.add(hiddenFileFilter);
  PathFilter jobFilter = getInputPathFilter(job);
  if (jobFilter != null) {
    filters.add(jobFilter);
  }
  PathFilter inputFilter = new MultiPathFilter(filters);
  List<FileStatus> result = null;
  int numThreads = job.getConfiguration().getInt(LIST_STATUS_NUM_THREADS, DEFAULT_LIST_STATUS_NUM_THREADS);
  StopWatch sw = new StopWatch().start();
  if (numThreads == 1) {
    result = singleThreadedListStatus(job, dirs, inputFilter, recursive);
  } else {
    Iterable<FileStatus> locatedFiles = null;
    try {
      LocatedFileStatusFetcher locatedFileStatusFetcher = new LocatedFileStatusFetcher(job.getConfiguration(), dirs, recursive, inputFilter, true);
      locatedFiles = locatedFileStatusFetcher.getFileStatuses();
    } catch (InterruptedException e) {
      throw new IOException("Interrupted while getting file statuses");
    }
    result = Lists.newArrayList(locatedFiles);
  }
  sw.stop();
  if (LOG.isDebugEnabled()) {
    LOG.debug("Time taken to get FileStatuses: " + sw.now(TimeUnit.MILLISECONDS));
  }
  LOG.info("Total input files to process : " + result.size());
  return result;
}
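MultiPathFilter is a private inner class of FileInputFormat, so it is not usable outside that class; its AND-composition behavior can be approximated with a sketch like the following (AndPathFilter is an illustrative name, not a Hadoop class):
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Accepts a path only if every delegate filter accepts it, mirroring how
// FileInputFormat combines hiddenFileFilter with the user-supplied filter.
public class AndPathFilter implements PathFilter {
  private final List<PathFilter> filters;

  public AndPathFilter(List<PathFilter> filters) {
    this.filters = filters;
  }

  @Override
  public boolean accept(Path path) {
    for (PathFilter filter : filters) {
      if (!filter.accept(path)) {
        return false;
      }
    }
    return true;
  }
}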
use of org.apache.hadoop.fs.PathFilter in project hbase by apache.
the class HFileArchiver method archiveRegion.
/**
* Remove an entire region from the table directory via archiving the region's hfiles.
* @param fs {@link FileSystem} from which to remove the region
* @param rootdir {@link Path} to the root directory where hbase files are stored (for building
* the archive path)
* @param tableDir {@link Path} to where the table is being stored (for building the archive path)
* @param regionDir {@link Path} to where a region is being stored (for building the archive path)
* @return <tt>true</tt> if the region was successfully deleted. <tt>false</tt> if the filesystem
* operations could not complete.
* @throws IOException if the request cannot be completed
*/
public static boolean archiveRegion(FileSystem fs, Path rootdir, Path tableDir, Path regionDir) throws IOException {
  if (LOG.isDebugEnabled()) {
    LOG.debug("ARCHIVING " + regionDir.toString());
  }
  // make sure we can archive
  if (tableDir == null || regionDir == null) {
    LOG.error("No archive directory could be found because tabledir (" + tableDir + ") or regiondir (" + regionDir + ") was null. Deleting files instead.");
    deleteRegionWithoutArchiving(fs, regionDir);
    // we should have archived, but failed to. It doesn't matter whether we deleted
    // the archived files correctly or not.
    return false;
  }
  // make sure the regiondir lives under the tabledir
  Preconditions.checkArgument(regionDir.toString().startsWith(tableDir.toString()));
  Path regionArchiveDir = HFileArchiveUtil.getRegionArchiveDir(rootdir, FSUtils.getTableName(tableDir), regionDir.getName());
  FileStatusConverter getAsFile = new FileStatusConverter(fs);
  // otherwise, we attempt to archive the store files
  // build collection of just the store directories to archive
  Collection<File> toArchive = new ArrayList<>();
  final PathFilter dirFilter = new FSUtils.DirFilter(fs);
  PathFilter nonHidden = new PathFilter() {
    @Override
    public boolean accept(Path file) {
      return dirFilter.accept(file) && !file.getName().toString().startsWith(".");
    }
  };
  FileStatus[] storeDirs = FSUtils.listStatus(fs, regionDir, nonHidden);
  // if there are no files, we can just delete the directory and return
  if (storeDirs == null) {
    LOG.debug("Region directory (" + regionDir + ") was empty, just deleting and returning!");
    return deleteRegionWithoutArchiving(fs, regionDir);
  }
  // convert the files in the region to a File
  toArchive.addAll(Lists.transform(Arrays.asList(storeDirs), getAsFile));
  LOG.debug("Archiving " + toArchive);
  List<File> failedArchive = resolveAndArchive(fs, regionArchiveDir, toArchive, EnvironmentEdgeManager.currentTime());
  if (!failedArchive.isEmpty()) {
    throw new FailedArchiveException("Failed to archive/delete all the files for region:" + regionDir.getName() + " into " + regionArchiveDir + ". Something is probably awry on the filesystem.", Collections2.transform(failedArchive, FUNC_FILE_TO_PATH));
  }
  // if that was successful, then we delete the region
  return deleteRegionWithoutArchiving(fs, regionDir);
}
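FSUtils.listStatus is an HBase convenience wrapper; the equivalent stock Hadoop call is FileSystem.listStatus(Path, PathFilter). A standalone usage sketch, assuming an illustrative directory path:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListNonHiddenChildren {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // the path below is illustrative, not a real layout requirement
    Path dir = new Path("/hbase/data/default/mytable/region");
    FileSystem fs = dir.getFileSystem(conf);
    PathFilter nonHidden = new PathFilter() {
      @Override
      public boolean accept(Path p) {
        return !p.getName().startsWith(".");
      }
    };
    // listStatus applies the filter before returning, so hidden entries
    // are dropped from the result array
    for (FileStatus status : fs.listStatus(dir, nonHidden)) {
      System.out.println(status.getPath());
    }
  }
}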
use of org.apache.hadoop.fs.PathFilter in project hbase by apache.
the class WALSplitter method getSplitEditFilesSorted.
/**
* Returns sorted set of edit files made by splitter, excluding files
* with '.temp' suffix.
*
* @param fs
* @param regiondir
* @return Files in passed <code>regiondir</code> as a sorted set.
* @throws IOException
*/
public static NavigableSet<Path> getSplitEditFilesSorted(final FileSystem fs, final Path regiondir) throws IOException {
  NavigableSet<Path> filesSorted = new TreeSet<>();
  Path editsdir = getRegionDirRecoveredEditsDir(regiondir);
  if (!fs.exists(editsdir))
    return filesSorted;
  FileStatus[] files = FSUtils.listStatus(fs, editsdir, new PathFilter() {
    @Override
    public boolean accept(Path p) {
      boolean result = false;
      try {
        // Return files and only files that match the editfile names pattern.
        // There can be other files in this directory other than edit files.
        // In particular, on error, we'll move aside the bad edit file giving
        // it a timestamp suffix. See moveAsideBadEditsFile.
        Matcher m = EDITFILES_NAME_PATTERN.matcher(p.getName());
        result = fs.isFile(p) && m.matches();
        // Skip a file whose name ends with RECOVERED_LOG_TMPFILE_SUFFIX,
        // because it means the split WAL thread is still writing this file.
        if (p.getName().endsWith(RECOVERED_LOG_TMPFILE_SUFFIX)) {
          result = false;
        }
        // Skip SeqId files
        if (isSequenceIdFile(p)) {
          result = false;
        }
      } catch (IOException e) {
        LOG.warn("Failed isFile check on " + p);
      }
      return result;
    }
  });
  if (files == null) {
    return filesSorted;
  }
  for (FileStatus status : files) {
    filesSorted.add(status.getPath());
  }
  return filesSorted;
}
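Since PathFilter has a single abstract method, it can also be supplied as a lambda on Java 8+. A simplified, self-contained sketch of the '.temp' exclusion above, with an illustrative directory path and suffix literal:
import java.util.NavigableSet;
import java.util.TreeSet;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class SortedEditFiles {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // illustrative location of a recovered-edits directory; listStatus
    // throws FileNotFoundException if it does not exist
    Path editsDir = new Path("/hbase/data/default/mytable/region/recovered.edits");
    FileSystem fs = editsDir.getFileSystem(conf);
    // lambda form of the anonymous PathFilter used by getSplitEditFilesSorted
    PathFilter noTempFiles = p -> !p.getName().endsWith(".temp");
    NavigableSet<Path> sorted = new TreeSet<>();
    for (FileStatus status : fs.listStatus(editsDir, noTempFiles)) {
      sorted.add(status.getPath());
    }
    System.out.println(sorted);
  }
}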
use of org.apache.hadoop.fs.PathFilter in project hbase by apache.
the class TestHFileArchiving method testDeleteRegionWithNoStoreFiles.
/**
* Test that the region directory is removed when we archive a region without store files, but
* still has hidden files.
* @throws Exception
*/
@Test
public void testDeleteRegionWithNoStoreFiles() throws Exception {
  final TableName tableName = TableName.valueOf(name.getMethodName());
  UTIL.createTable(tableName, TEST_FAM);
  // get the current store files for the region
  List<HRegion> servingRegions = UTIL.getHBaseCluster().getRegions(tableName);
  // make sure we only have 1 region serving this table
  assertEquals(1, servingRegions.size());
  HRegion region = servingRegions.get(0);
  FileSystem fs = region.getRegionFileSystem().getFileSystem();
  // make sure there are some files in the regiondir
  Path rootDir = FSUtils.getRootDir(fs.getConf());
  Path regionDir = HRegion.getRegionDir(rootDir, region.getRegionInfo());
  FileStatus[] regionFiles = FSUtils.listStatus(fs, regionDir, null);
  Assert.assertNotNull("No files in the region directory", regionFiles);
  if (LOG.isDebugEnabled()) {
    List<Path> files = new ArrayList<>();
    for (FileStatus file : regionFiles) {
      files.add(file.getPath());
    }
    LOG.debug("Current files:" + files);
  }
  // delete the visible folders so we just have hidden files/folders
  final PathFilter dirFilter = new FSUtils.DirFilter(fs);
  PathFilter nonHidden = new PathFilter() {
    @Override
    public boolean accept(Path file) {
      return dirFilter.accept(file) && !file.getName().toString().startsWith(".");
    }
  };
  FileStatus[] storeDirs = FSUtils.listStatus(fs, regionDir, nonHidden);
  for (FileStatus store : storeDirs) {
    LOG.debug("Deleting store for test");
    fs.delete(store.getPath(), true);
  }
  // then archive the region
  HFileArchiver.archiveRegion(UTIL.getConfiguration(), fs, region.getRegionInfo());
  // and check to make sure the region directory got deleted
  assertFalse("Region directory (" + regionDir + ") still exists.", fs.exists(regionDir));
  UTIL.deleteTable(tableName);
}
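The nonHidden filter above duplicates the anonymous class in HFileArchiver.archiveRegion; one way to avoid that duplication is to extract a named filter. A sketch under that assumption (VisibleDirFilter is an illustrative name; FSUtils.DirFilter is the real HBase helper it wraps):
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hbase.util.FSUtils;

// Accepts only non-hidden directories: the same predicate that both
// HFileArchiver.archiveRegion and this test build anonymously.
public class VisibleDirFilter implements PathFilter {
  private final PathFilter dirFilter;

  public VisibleDirFilter(FileSystem fs) {
    this.dirFilter = new FSUtils.DirFilter(fs);
  }

  @Override
  public boolean accept(Path path) {
    return dirFilter.accept(path) && !path.getName().startsWith(".");
  }
}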