
Example 96 with PathFilter

Use of org.apache.hadoop.fs.PathFilter in project incubator-gobblin by apache.

The class MetricsFileSystemInstrumentationTest, method testGlobStatusWithFilter:

@Test(enabled = false)
public void testGlobStatusWithFilter() throws IOException, URISyntaxException {
    HDFSRoot hdfsRoot = new HDFSRoot("/tmp/GlobStatusWithFilter");
    MetricsFileSystemInstrumentation fs = (MetricsFileSystemInstrumentation) FileSystem.get(new URI(instrumentedURI), new Configuration());
    FileStatus[] status = fs.globStatus(new Path("/tmp/GlobStatusWithFilter/*/*"), new PathFilter() {

        @Override
        public boolean accept(Path path) {
            return path.toString().endsWith(".ext");
        }
    });
    Assert.assertEquals(fs.globStatusTimer.getCount(), 1);
    Assert.assertEquals(status.length, 2);
    hdfsRoot.cleanupRoot();
}
Also used: Path (org.apache.hadoop.fs.Path), PathFilter (org.apache.hadoop.fs.PathFilter), FileStatus (org.apache.hadoop.fs.FileStatus), Configuration (org.apache.hadoop.conf.Configuration), URI (java.net.URI), Test (org.testng.annotations.Test)
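
Since PathFilter has a single abstract method, accept(Path), the anonymous class above can be written as a lambda on Java 8 and later. A minimal sketch of the same call, reusing fs and the glob pattern from the test:

    // Equivalent filter expressed as a lambda: keep only paths whose string
    // form ends with ".ext".
    FileStatus[] matched = fs.globStatus(new Path("/tmp/GlobStatusWithFilter/*/*"),
        path -> path.toString().endsWith(".ext"));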

Example 97 with PathFilter

Use of org.apache.hadoop.fs.PathFilter in project incubator-gobblin by apache.

The class AvroKeyCompactorOutputCommitter, method commitTask:

/**
 * Commits the task, moving files to their final committed location by delegating to
 * {@link FileOutputCommitter} to perform the actual moving. First, renames the
 * files to include the count of records contained within the file and a timestamp,
 * in the form {recordCount}.{timestamp}.avro. Then, the files are moved to their
 * committed location.
 */
@Override
public void commitTask(TaskAttemptContext context) throws IOException {
    Path workPath = getWorkPath();
    FileSystem fs = workPath.getFileSystem(context.getConfiguration());
    if (fs.exists(workPath)) {
        long recordCount = getRecordCountFromCounter(context, AvroKeyDedupReducer.EVENT_COUNTER.RECORD_COUNT);
        String fileNamePrefix;
        if (recordCount == 0) {
            // recordCount == 0 indicates that it is a map-only, non-dedup job, and thus record count should
            // be obtained from mapper counter.
            fileNamePrefix = CompactionRecordCountProvider.M_OUTPUT_FILE_PREFIX;
            recordCount = getRecordCountFromCounter(context, AvroKeyMapper.EVENT_COUNTER.RECORD_COUNT);
        } else {
            fileNamePrefix = CompactionRecordCountProvider.MR_OUTPUT_FILE_PREFIX;
        }
        String fileName = CompactionRecordCountProvider.constructFileName(fileNamePrefix, recordCount);
        for (FileStatus status : fs.listStatus(workPath, new PathFilter() {

            @Override
            public boolean accept(Path path) {
                return FilenameUtils.isExtension(path.getName(), "avro");
            }
        })) {
            Path newPath = new Path(status.getPath().getParent(), fileName);
            LOG.info(String.format("Renaming %s to %s", status.getPath(), newPath));
            fs.rename(status.getPath(), newPath);
        }
    }
    super.commitTask(context);
}
Also used: Path (org.apache.hadoop.fs.Path), PathFilter (org.apache.hadoop.fs.PathFilter), FileStatus (org.apache.hadoop.fs.FileStatus), FileSystem (org.apache.hadoop.fs.FileSystem)
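
The same extension check can be expressed as a lambda. A minimal sketch of just the listing step, assuming the workPath and fs variables from the method above:

    // List only the ".avro" output files under the task's work directory;
    // directories and files with other extensions are rejected by the filter.
    FileStatus[] avroFiles = fs.listStatus(workPath,
        path -> FilenameUtils.isExtension(path.getName(), "avro"));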

Example 98 with PathFilter

Use of org.apache.hadoop.fs.PathFilter in project incubator-gobblin by apache.

The class HiveSnapshotRegistrationPolicy, method getLatestSnapshot:

/**
 * Get the latest snapshot in the given {@link Path}.
 *
 * <p>
 *   The latest snapshot is a sub-directory of the input {@link Path} that has the largest folder
 *   name alphabetically. If property {@link #SNAPSHOT_PATH_PATTERN} is set, only those sub-directories
 *   whose full path matches the given pattern are considered.
 * </p>
 */
protected Path getLatestSnapshot(Path path) throws IOException {
    FileStatus[] statuses = this.fs.listStatus(path, new PathFilter() {

        @Override
        public boolean accept(Path p) {
            try {
                if (!HiveSnapshotRegistrationPolicy.this.fs.isDirectory(p)) {
                    return false;
                }
            } catch (IOException e) {
                throw Throwables.propagate(e);
            }
            return !HiveSnapshotRegistrationPolicy.this.snapshotPathPattern.isPresent() || HiveSnapshotRegistrationPolicy.this.snapshotPathPattern.get().matcher(p.toString()).matches();
        }
    });
    if (statuses.length == 0) {
        return null;
    }
    Arrays.sort(statuses, new Comparator<FileStatus>() {

        @Override
        public int compare(FileStatus o1, FileStatus o2) {
            return o2.getPath().getName().compareTo(o1.getPath().getName());
        }
    });
    return statuses[0].getPath();
}
Also used: Path (org.apache.hadoop.fs.Path), PathFilter (org.apache.hadoop.fs.PathFilter), FileStatus (org.apache.hadoop.fs.FileStatus), IOException (java.io.IOException)
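
Because snapshots are compared purely by directory name, the sort-then-take-first step selects the lexicographically largest child. A hedged sketch of an equivalent selection with the Streams API, assuming the same statuses array as above:

    // Pick the directory whose name sorts last; the Optional is empty when no
    // snapshot matched the filter, mirroring the null return above.
    Path latest = Arrays.stream(statuses)
        .max(Comparator.comparing((FileStatus s) -> s.getPath().getName()))
        .map(FileStatus::getPath)
        .orElse(null);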

Example 99 with PathFilter

Use of org.apache.hadoop.fs.PathFilter in project elephant-bird by twitter.

The class HdfsUtils, method getDirectorySize:

/**
 * Calculates the total size of all the contents of a directory that are accepted
 * by filter. All subdirectories will be searched recursively and paths in subdirectories
 * that are accepted by filter will also be counted.
 *
 * Does not include the size of directories themselves
 * (which are 0 in HDFS but may not be 0 on local file systems)
 *
 * To get the size of a directory without filtering, use
 * {@link #getDirectorySize(Path, FileSystem)} which is much more efficient.
 *
 * @param path path to recursively walk
 * @param fs FileSystem for this path
 * @param filter path filter determining which paths' sizes to include in the total
 *               NOTE: you do *not* need to filter out directories, this will be done for you
 * @return size of the directory in bytes
 * @throws IOException
 */
public static long getDirectorySize(Path path, FileSystem fs, PathFilter filter) throws IOException {
    PathSizeVisitor visitor = new PathSizeVisitor();
    PathFilter composite = new PathFilters.CompositePathFilter(PathFilters.newExcludeDirectoriesFilter(fs.getConf()), filter);
    walkPath(path, fs, composite, visitor);
    return visitor.getSize();
}
Also used: PathFilter (org.apache.hadoop.fs.PathFilter)
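
A short usage sketch of the method above, summing only the bytes of Avro files under a directory (the /data/events path is a hypothetical placeholder):

    // Sum the sizes of all ".avro" files under /data/events, recursing into
    // subdirectories. Directories are already excluded by getDirectorySize,
    // so the filter only needs to match the files of interest.
    FileSystem fs = FileSystem.get(new Configuration());
    long avroBytes = HdfsUtils.getDirectorySize(new Path("/data/events"), fs,
        path -> path.getName().endsWith(".avro"));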

Example 100 with PathFilter

Use of org.apache.hadoop.fs.PathFilter in project incubator-gobblin by apache.

The class FileListUtilsTest, method testListPathsRecursively:

@Test
public void testListPathsRecursively() throws IOException {
    FileSystem localFs = FileSystem.getLocal(new Configuration());
    Path baseDir = new Path(FILE_UTILS_TEST_DIR, "fileListTestDir2");
    try {
        if (localFs.exists(baseDir)) {
            localFs.delete(baseDir, true);
        }
        localFs.mkdirs(baseDir);
        localFs.create(new Path(baseDir, TEST_FILE_NAME1));
        Path subDir = new Path(baseDir, "subDir");
        localFs.mkdirs(subDir);
        localFs.create(new Path(subDir, TEST_FILE_NAME2));
        List<FileStatus> testFiles = FileListUtils.listPathsRecursively(localFs, baseDir, new PathFilter() {

            @Override
            public boolean accept(Path path) {
                return true;
            }
        });
        Assert.assertEquals(4, testFiles.size());
        Set<String> fileNames = Sets.newHashSet();
        for (FileStatus testFileStatus : testFiles) {
            fileNames.add(testFileStatus.getPath().getName());
        }
        Set<String> expectedFileNames = Sets.newHashSet();
        expectedFileNames.add(baseDir.getName());
        expectedFileNames.add(subDir.getName());
        expectedFileNames.add(TEST_FILE_NAME1);
        expectedFileNames.add(TEST_FILE_NAME2);
        Assert.assertEquals(fileNames, expectedFileNames);
    } finally {
        localFs.delete(baseDir, true);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), PathFilter (org.apache.hadoop.fs.PathFilter), FileStatus (org.apache.hadoop.fs.FileStatus), Configuration (org.apache.hadoop.conf.Configuration), FileSystem (org.apache.hadoop.fs.FileSystem), Test (org.testng.annotations.Test)
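
The accept-everything filter explains the expected count of four: the base directory, its sub-directory, and the two test files are all returned. Any other PathFilter can be passed the same way; a minimal sketch reusing localFs, baseDir, and TEST_FILE_NAME1 from the test (how a restrictive filter interacts with directory traversal depends on FileListUtils' implementation):

    // Recursive listing restricted by a lambda filter instead of the
    // accept-all anonymous class used in the test.
    List<FileStatus> filtered = FileListUtils.listPathsRecursively(localFs, baseDir,
        path -> path.getName().equals(TEST_FILE_NAME1));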

Aggregations

PathFilter (org.apache.hadoop.fs.PathFilter): 123
Path (org.apache.hadoop.fs.Path): 114
FileStatus (org.apache.hadoop.fs.FileStatus): 96
Test (org.junit.Test): 47
IOException (java.io.IOException): 42
FileSystem (org.apache.hadoop.fs.FileSystem): 39
ArrayList (java.util.ArrayList): 22
List (java.util.List): 19
Configuration (org.apache.hadoop.conf.Configuration): 18
Collections (java.util.Collections): 11
BufferedReader (java.io.BufferedReader): 9
InputStreamReader (java.io.InputStreamReader): 9
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 9
Assert.assertEquals (org.junit.Assert.assertEquals): 9
Assert.assertTrue (org.junit.Assert.assertTrue): 9
URI (java.net.URI): 8
Test (org.testng.annotations.Test): 8
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 7
IGNORED (com.facebook.presto.hive.NestedDirectoryPolicy.IGNORED): 6
RECURSE (com.facebook.presto.hive.NestedDirectoryPolicy.RECURSE): 6