Search in sources:

Example 1 with LocatedFileStatus

use of org.apache.hadoop.fs.LocatedFileStatus in project druid by druid-io.

the class HdfsDataSegmentFinder method findSegments.

@Override
public Set<DataSegment> findSegments(String workingDirPathStr, boolean updateDescriptor) throws SegmentLoadingException {
    final Set<DataSegment> segments = Sets.newHashSet();
    final Path workingDirPath = new Path(workingDirPathStr);
    FileSystem fs;
    try {
        fs = workingDirPath.getFileSystem(config);
        log.info(fs.getScheme());
        log.info("FileSystem URI:" + fs.getUri().toString());
        if (!fs.exists(workingDirPath)) {
            throw new SegmentLoadingException("Working directory [%s] doesn't exist.", workingDirPath);
        }
        if (!fs.isDirectory(workingDirPath)) {
            throw new SegmentLoadingException("Working directory [%s] is not a directory!?", workingDirPath);
        }
        final RemoteIterator<LocatedFileStatus> it = fs.listFiles(workingDirPath, true);
        while (it.hasNext()) {
            final LocatedFileStatus locatedFileStatus = it.next();
            final Path path = locatedFileStatus.getPath();
            if (path.getName().endsWith("descriptor.json")) {
                final Path indexZip;
                final String[] descriptorParts = path.getName().split("_");
                if (descriptorParts.length == 2 && descriptorParts[1].equals("descriptor.json") && StringUtils.isNumeric(descriptorParts[0])) {
                    indexZip = new Path(path.getParent(), String.format("%s_index.zip", descriptorParts[0]));
                } else {
                    indexZip = new Path(path.getParent(), "index.zip");
                }
                if (fs.exists(indexZip)) {
                    final DataSegment dataSegment = mapper.readValue(fs.open(path), DataSegment.class);
                    log.info("Found segment [%s] located at [%s]", dataSegment.getIdentifier(), indexZip);
                    final Map<String, Object> loadSpec = dataSegment.getLoadSpec();
                    final String pathWithoutScheme = indexZip.toUri().getPath();
                    if (!loadSpec.get("type").equals(HdfsStorageDruidModule.SCHEME) || !loadSpec.get("path").equals(pathWithoutScheme)) {
                        loadSpec.put("type", HdfsStorageDruidModule.SCHEME);
                        loadSpec.put("path", pathWithoutScheme);
                        if (updateDescriptor) {
                            log.info("Updating loadSpec in descriptor.json at [%s] with new path [%s]", path, pathWithoutScheme);
                            mapper.writeValue(fs.create(path, true), dataSegment);
                        }
                    }
                    segments.add(dataSegment);
                } else {
                    throw new SegmentLoadingException("index.zip didn't exist at [%s] while descripter.json exists!?", indexZip);
                }
            }
        }
    } catch (IOException e) {
        throw new SegmentLoadingException(e, "Problems interacting with filesystem[%s].", workingDirPath);
    }
    return segments;
}
Also used: Path(org.apache.hadoop.fs.Path) SegmentLoadingException(io.druid.segment.loading.SegmentLoadingException) FileSystem(org.apache.hadoop.fs.FileSystem) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) IOException(java.io.IOException) DataSegment(io.druid.timeline.DataSegment)
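
The descriptor-to-index resolution above relies on a Druid naming convention: a partition-numbered descriptor such as 0_descriptor.json sits next to 0_index.zip, while a plain descriptor.json sits next to index.zip. Here is a minimal sketch of that resolution pulled out into a standalone helper, assuming the same commons-lang StringUtils used above (the name resolveIndexZip is illustrative, not part of Druid):

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.Path;

// Illustrative helper: map a descriptor path to its sibling index.zip,
// following the same convention the finder above depends on.
static Path resolveIndexZip(Path descriptor) {
    final String[] parts = descriptor.getName().split("_");
    if (parts.length == 2 && "descriptor.json".equals(parts[1]) && StringUtils.isNumeric(parts[0])) {
        // e.g. "3_descriptor.json" -> "3_index.zip"
        return new Path(descriptor.getParent(), String.format("%s_index.zip", parts[0]));
    }
    // plain "descriptor.json" -> "index.zip"
    return new Path(descriptor.getParent(), "index.zip");
}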

Example 2 with LocatedFileStatus

use of org.apache.hadoop.fs.LocatedFileStatus in project hbase by apache.

the class TestBackupLogCleaner method getListOfWALFiles.

private List<FileStatus> getListOfWALFiles(Configuration c) throws IOException {
    Path logRoot = new Path(FSUtils.getRootDir(c), HConstants.HREGION_LOGDIR_NAME);
    FileSystem fs = FileSystem.get(c);
    RemoteIterator<LocatedFileStatus> it = fs.listFiles(logRoot, true);
    List<FileStatus> logFiles = new ArrayList<FileStatus>();
    while (it.hasNext()) {
        LocatedFileStatus lfs = it.next();
        if (lfs.isFile() && !AbstractFSWALProvider.isMetaFile(lfs.getPath())) {
            logFiles.add(lfs);
            LOG.info(lfs);
        }
    }
    return logFiles;
}
Also used: Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) FileSystem(org.apache.hadoop.fs.FileSystem) ArrayList(java.util.ArrayList)
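
The recursive walk in this test is the core LocatedFileStatus idiom. Note that RemoteIterator, unlike java.util.Iterator, declares IOException on hasNext() and next(), because fetching the next page of results may require another remote call. A minimal, self-contained sketch of the same pattern, assuming an already-configured FileSystem:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

// Collect every regular file under root, recursively.
static List<Path> listAllFiles(FileSystem fs, Path root) throws IOException {
    List<Path> result = new ArrayList<>();
    // second argument = recursive; hasNext()/next() may perform RPCs
    RemoteIterator<LocatedFileStatus> it = fs.listFiles(root, true);
    while (it.hasNext()) {
        LocatedFileStatus status = it.next();
        if (status.isFile()) {
            result.add(status.getPath());
        }
    }
    return result;
}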

Example 3 with LocatedFileStatus

use of org.apache.hadoop.fs.LocatedFileStatus in project hadoop by apache.

the class GenerateData method publishPlainDataStatistics.

static DataStatistics publishPlainDataStatistics(Configuration conf, Path inputDir) throws IOException {
    FileSystem fs = inputDir.getFileSystem(conf);
    // obtain input data file statuses
    long dataSize = 0;
    long fileCount = 0;
    RemoteIterator<LocatedFileStatus> iter = fs.listFiles(inputDir, true);
    PathFilter filter = new Utils.OutputFileUtils.OutputFilesFilter();
    while (iter.hasNext()) {
        LocatedFileStatus lStatus = iter.next();
        if (filter.accept(lStatus.getPath())) {
            dataSize += lStatus.getLen();
            ++fileCount;
        }
    }
    // publish the plain data statistics
    LOG.info("Total size of input data : " + StringUtils.humanReadableInt(dataSize));
    LOG.info("Total number of input data files : " + fileCount);
    return new DataStatistics(dataSize, fileCount, false);
}
Also used: PathFilter(org.apache.hadoop.fs.PathFilter) Utils(org.apache.hadoop.mapred.Utils) StringUtils(org.apache.hadoop.util.StringUtils) FileSystem(org.apache.hadoop.fs.FileSystem) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus)
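
listFiles() returns LocatedFileStatus rather than plain FileStatus precisely so that callers like this one can also read block locations without a separate getFileBlockLocations() call per file. A sketch of that, under the same listing setup (printBlockHosts is an illustrative name, not part of Gridmix):

import java.io.IOException;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

// Block locations arrive with the listing itself, so no extra RPC per file.
static void printBlockHosts(FileSystem fs, Path dir) throws IOException {
    RemoteIterator<LocatedFileStatus> it = fs.listFiles(dir, true);
    while (it.hasNext()) {
        LocatedFileStatus status = it.next();
        for (BlockLocation block : status.getBlockLocations()) {
            System.out.printf("%s offset=%d hosts=%s%n",
                status.getPath(), block.getOffset(), String.join(",", block.getHosts()));
        }
    }
}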

Example 4 with LocatedFileStatus

use of org.apache.hadoop.fs.LocatedFileStatus in project hadoop by apache.

the class S3AFileSystem method listLocatedStatus.

/**
   * {@inheritDoc}.
   *
   * S3-optimized directory listing. The initial operation performs the
   * first bulk listing; further listings take place as the current
   * set of results is used up.
   * @param f a path
   * @param filter a path filter
   * @return an iterator that traverses statuses of the files/directories
   *         in the given path
   * @throws FileNotFoundException if {@code path} does not exist
   * @throws IOException if any I/O error occurred
   */
@Override
public RemoteIterator<LocatedFileStatus> listLocatedStatus(final Path f, final PathFilter filter) throws FileNotFoundException, IOException {
    incrementStatistic(INVOCATION_LIST_LOCATED_STATUS);
    Path path = qualify(f);
    LOG.debug("listLocatedStatus({}, {}", path, filter);
    try {
        // lookup dir triggers existence check
        final FileStatus fileStatus = getFileStatus(path);
        if (fileStatus.isFile()) {
            // simple case: File
            LOG.debug("Path is a file");
            return new Listing.SingleStatusRemoteIterator(filter.accept(path) ? toLocatedFileStatus(fileStatus) : null);
        } else {
            // directory: trigger a lookup
            String key = maybeAddTrailingSlash(pathToKey(path));
            return listing.createLocatedFileStatusIterator(listing.createFileStatusListingIterator(path, createListObjectsRequest(key, "/"), filter, new Listing.AcceptAllButSelfAndS3nDirs(path)));
        }
    } catch (AmazonClientException e) {
        throw translateException("listLocatedStatus", path, e);
    }
}
Also used: Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) AmazonClientException(com.amazonaws.AmazonClientException)
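
From a caller's perspective, note that this two-argument overload is declared protected on the FileSystem base class (S3AFileSystem widens it to public), so code written against the generic FileSystem API usually calls the public single-argument form and filters client-side. A sketch of that, with the .json suffix as an arbitrary example:

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

// Non-recursive listing of one directory level, filtered client-side.
static void printJsonEntries(FileSystem fs, Path dir) throws IOException {
    RemoteIterator<LocatedFileStatus> it = fs.listLocatedStatus(dir);
    while (it.hasNext()) {
        LocatedFileStatus status = it.next();
        if (status.isFile() && status.getPath().getName().endsWith(".json")) {
            System.out.println(status.getPath());
        }
    }
}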

Example 5 with LocatedFileStatus

use of org.apache.hadoop.fs.LocatedFileStatus in project hadoop by apache.

the class S3AFileSystem method listFiles.

/**
   * {@inheritDoc}.
   *
   * This implementation is optimized for S3, which can do a bulk listing
   * of all entries under a path in a single operation. Thus there is
   * no need to recursively walk the directory tree.
   *
   * Instead a {@link ListObjectsRequest} is created requesting a (windowed)
   * listing of all entries under the given path. This is used to construct
   * an {@code ObjectListingIterator} instance, iteratively returning the
   * sequence of lists of elements under the path. This is then iterated
   * over in a {@code FileStatusListingIterator}, which generates
   * {@link S3AFileStatus} instances, one per listing entry.
   * These are then translated into {@link LocatedFileStatus} instances.
   *
   * This is essentially a nested and wrapped set of iterators, with some
   * generator classes; an architecture that may become less convoluted
   * with lambda expressions.
   * @param f a path
   * @param recursive if the subdirectories need to be traversed recursively
   *
   * @return an iterator that traverses statuses of the files/directories
   *         in the given path
   * @throws FileNotFoundException if {@code path} does not exist
   * @throws IOException if any I/O error occurred
   */
@Override
public RemoteIterator<LocatedFileStatus> listFiles(Path f, boolean recursive) throws FileNotFoundException, IOException {
    incrementStatistic(INVOCATION_LIST_FILES);
    Path path = qualify(f);
    LOG.debug("listFiles({}, {})", path, recursive);
    try {
        // lookup dir triggers existence check
        final FileStatus fileStatus = getFileStatus(path);
        if (fileStatus.isFile()) {
            // simple case: File
            LOG.debug("Path is a file");
            return new Listing.SingleStatusRemoteIterator(toLocatedFileStatus(fileStatus));
        } else {
            // directory: do a bulk operation
            String key = maybeAddTrailingSlash(pathToKey(path));
            String delimiter = recursive ? null : "/";
            LOG.debug("Requesting all entries under {} with delimiter '{}'", key, delimiter);
            return listing.createLocatedFileStatusIterator(listing.createFileStatusListingIterator(path, createListObjectsRequest(key, delimiter), ACCEPT_ALL, new Listing.AcceptFilesOnly(path)));
        }
    } catch (AmazonClientException e) {
        throw translateException("listFiles", path, e);
    }
}
Also used: Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) AmazonClientException(com.amazonaws.AmazonClientException)
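
A usage sketch tying the recursive flag to the delimiter logic above: recursive == true drops the delimiter, so S3 returns one flat (windowed) listing of every key under the prefix, while recursive == false keeps the "/" delimiter and lists a single directory level. The bucket and prefix below are placeholders, shown only for illustration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class ListS3Example {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // hypothetical bucket/prefix
        Path root = new Path("s3a://example-bucket/data/");
        FileSystem fs = root.getFileSystem(conf);
        // one flat key listing under the prefix, fetched page by page
        RemoteIterator<LocatedFileStatus> it = fs.listFiles(root, true);
        while (it.hasNext()) {
            System.out.println(it.next().getPath());
        }
    }
}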

Aggregations

LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus): 139
Path (org.apache.hadoop.fs.Path): 104
FileSystem (org.apache.hadoop.fs.FileSystem): 55
ArrayList (java.util.ArrayList): 43
Test (org.junit.Test): 33
FileStatus (org.apache.hadoop.fs.FileStatus): 29
IOException (java.io.IOException): 27
Configuration (org.apache.hadoop.conf.Configuration): 20
File (java.io.File): 13
FileNotFoundException (java.io.FileNotFoundException): 11
HashSet (java.util.HashSet): 11
BlockLocation (org.apache.hadoop.fs.BlockLocation): 9
RemoteIterator (org.apache.hadoop.fs.RemoteIterator): 7
DistributedFileSystem (org.apache.hadoop.hdfs.DistributedFileSystem): 7
StocatorPath (com.ibm.stocator.fs.common.StocatorPath): 6
HashMap (java.util.HashMap): 6
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 6
Map (java.util.Map): 5
Matcher (java.util.regex.Matcher): 5
BufferedReader (java.io.BufferedReader): 4