use of org.apache.hadoop.fs.LocatedFileStatus in project druid by druid-io.
the class HdfsDataSegmentFinder method findSegments.
@Override
public Set<DataSegment> findSegments(String workingDirPathStr, boolean updateDescriptor) throws SegmentLoadingException {
  final Set<DataSegment> segments = Sets.newHashSet();
  final Path workingDirPath = new Path(workingDirPathStr);
  FileSystem fs;
  try {
    fs = workingDirPath.getFileSystem(config);
    log.info(fs.getScheme());
    log.info("FileSystem URI:" + fs.getUri().toString());
    if (!fs.exists(workingDirPath)) {
      throw new SegmentLoadingException("Working directory [%s] doesn't exist.", workingDirPath);
    }
    if (!fs.isDirectory(workingDirPath)) {
      throw new SegmentLoadingException("Working directory [%s] is not a directory!?", workingDirPath);
    }
    // Recursively walk the working directory looking for descriptor.json files.
    final RemoteIterator<LocatedFileStatus> it = fs.listFiles(workingDirPath, true);
    while (it.hasNext()) {
      final LocatedFileStatus locatedFileStatus = it.next();
      final Path path = locatedFileStatus.getPath();
      if (path.getName().endsWith("descriptor.json")) {
        final Path indexZip;
        final String[] descriptorParts = path.getName().split("_");
        // Descriptors may be named "descriptor.json" or "<N>_descriptor.json";
        // the matching index zip follows the same naming convention.
        if (descriptorParts.length == 2 && descriptorParts[1].equals("descriptor.json") && StringUtils.isNumeric(descriptorParts[0])) {
          indexZip = new Path(path.getParent(), String.format("%s_index.zip", descriptorParts[0]));
        } else {
          indexZip = new Path(path.getParent(), "index.zip");
        }
        if (fs.exists(indexZip)) {
          final DataSegment dataSegment = mapper.readValue(fs.open(path), DataSegment.class);
          log.info("Found segment [%s] located at [%s]", dataSegment.getIdentifier(), indexZip);
          final Map<String, Object> loadSpec = dataSegment.getLoadSpec();
          final String pathWithoutScheme = indexZip.toUri().getPath();
          if (!loadSpec.get("type").equals(HdfsStorageDruidModule.SCHEME) || !loadSpec.get("path").equals(pathWithoutScheme)) {
            loadSpec.put("type", HdfsStorageDruidModule.SCHEME);
            loadSpec.put("path", pathWithoutScheme);
            if (updateDescriptor) {
              log.info("Updating loadSpec in descriptor.json at [%s] with new path [%s]", path, pathWithoutScheme);
              mapper.writeValue(fs.create(path, true), dataSegment);
            }
          }
          segments.add(dataSegment);
        } else {
          throw new SegmentLoadingException("index.zip didn't exist at [%s] while descriptor.json exists!?", indexZip);
        }
      }
    }
  } catch (IOException e) {
    throw new SegmentLoadingException(e, "Problems interacting with filesystem[%s].", workingDirPath);
  }
  return segments;
}
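The core pattern in this method, fs.listFiles(path, true) returning a RemoteIterator<LocatedFileStatus>, can be used on its own. The following is a minimal, self-contained sketch of that pattern; the class name and the command-line argument are assumptions for illustration and are not part of the Druid code.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class DescriptorScan {
  public static void main(String[] args) throws IOException {
    // args[0] is assumed to be a working directory, e.g. an hdfs:// URI.
    Path workingDir = new Path(args[0]);
    FileSystem fs = workingDir.getFileSystem(new Configuration());
    // Recursive listing: every file under workingDir, returned as LocatedFileStatus.
    RemoteIterator<LocatedFileStatus> it = fs.listFiles(workingDir, true);
    while (it.hasNext()) {
      LocatedFileStatus status = it.next();
      if (status.getPath().getName().endsWith("descriptor.json")) {
        System.out.println("Found descriptor: " + status.getPath());
      }
    }
  }
}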
use of org.apache.hadoop.fs.LocatedFileStatus in project hbase by apache.
the class TestBackupLogCleaner method getListOfWALFiles.
private List<FileStatus> getListOfWALFiles(Configuration c) throws IOException {
  Path logRoot = new Path(FSUtils.getRootDir(c), HConstants.HREGION_LOGDIR_NAME);
  FileSystem fs = FileSystem.get(c);
  RemoteIterator<LocatedFileStatus> it = fs.listFiles(logRoot, true);
  List<FileStatus> logFiles = new ArrayList<FileStatus>();
  while (it.hasNext()) {
    LocatedFileStatus lfs = it.next();
    if (lfs.isFile() && !AbstractFSWALProvider.isMetaFile(lfs.getPath())) {
      logFiles.add(lfs);
      LOG.info(lfs);
    }
  }
  return logFiles;
}
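What distinguishes LocatedFileStatus from a plain FileStatus is that it already carries block locations, so code iterating WAL files (or any other files) can inspect data locality without issuing extra getFileBlockLocations() calls. A hedged sketch of that idea follows; the class name and input path are illustrative only.

import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class WalLocality {
  public static void main(String[] args) throws IOException {
    Path logRoot = new Path(args[0]); // assumed: the log directory to inspect
    FileSystem fs = logRoot.getFileSystem(new Configuration());
    RemoteIterator<LocatedFileStatus> it = fs.listFiles(logRoot, true);
    while (it.hasNext()) {
      LocatedFileStatus lfs = it.next();
      // Block locations come back with the listing; no extra namenode round trip needed.
      for (BlockLocation block : lfs.getBlockLocations()) {
        System.out.println(lfs.getPath() + " offset=" + block.getOffset()
            + " hosts=" + Arrays.toString(block.getHosts()));
      }
    }
  }
}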
use of org.apache.hadoop.fs.LocatedFileStatus in project hadoop by apache.
the class GenerateData method publishPlainDataStatistics.
static DataStatistics publishPlainDataStatistics(Configuration conf, Path inputDir) throws IOException {
  FileSystem fs = inputDir.getFileSystem(conf);
  // obtain input data file statuses
  long dataSize = 0;
  long fileCount = 0;
  RemoteIterator<LocatedFileStatus> iter = fs.listFiles(inputDir, true);
  PathFilter filter = new Utils.OutputFileUtils.OutputFilesFilter();
  while (iter.hasNext()) {
    LocatedFileStatus lStatus = iter.next();
    if (filter.accept(lStatus.getPath())) {
      dataSize += lStatus.getLen();
      ++fileCount;
    }
  }
  // publish the plain data statistics
  LOG.info("Total size of input data : " + StringUtils.humanReadableInt(dataSize));
  LOG.info("Total number of input data files : " + fileCount);
  return new DataStatistics(dataSize, fileCount, false);
}
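The same aggregate-as-you-iterate pattern works with any PathFilter. The sketch below is an assumption-laden illustration, not the Gridmix code: it uses a lambda that skips files starting with "_" or "." (similar in spirit to OutputFilesFilter), and the input directory comes from the command line.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.RemoteIterator;

public class InputSizeReport {
  public static void main(String[] args) throws IOException {
    Path inputDir = new Path(args[0]); // assumed input directory
    FileSystem fs = inputDir.getFileSystem(new Configuration());
    // Skip "hidden" outputs by the usual MapReduce naming convention.
    PathFilter filter = path -> {
      String name = path.getName();
      return !name.startsWith("_") && !name.startsWith(".");
    };
    long dataSize = 0;
    long fileCount = 0;
    RemoteIterator<LocatedFileStatus> iter = fs.listFiles(inputDir, true);
    while (iter.hasNext()) {
      LocatedFileStatus status = iter.next();
      if (filter.accept(status.getPath())) {
        dataSize += status.getLen();
        ++fileCount;
      }
    }
    System.out.println("bytes=" + dataSize + " files=" + fileCount);
  }
}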
use of org.apache.hadoop.fs.LocatedFileStatus in project hadoop by apache.
the class S3AFileSystem method listLocatedStatus.
/**
 * {@inheritDoc}.
 *
 * S3-optimized directory listing. The initial operation performs the
 * first bulk listing; extra listings take place
 * when the current set of results is used up.
 * @param f a path
 * @param filter a path filter
 * @return an iterator that traverses statuses of the files/directories
 *         in the given path
 * @throws FileNotFoundException if {@code path} does not exist
 * @throws IOException if any I/O error occurred
 */
@Override
public RemoteIterator<LocatedFileStatus> listLocatedStatus(final Path f, final PathFilter filter)
    throws FileNotFoundException, IOException {
  incrementStatistic(INVOCATION_LIST_LOCATED_STATUS);
  Path path = qualify(f);
  LOG.debug("listLocatedStatus({}, {})", path, filter);
  try {
    // lookup dir triggers existence check
    final FileStatus fileStatus = getFileStatus(path);
    if (fileStatus.isFile()) {
      // simple case: File
      LOG.debug("Path is a file");
      return new Listing.SingleStatusRemoteIterator(
          filter.accept(path) ? toLocatedFileStatus(fileStatus) : null);
    } else {
      // directory: trigger a lookup
      String key = maybeAddTrailingSlash(pathToKey(path));
      return listing.createLocatedFileStatusIterator(
          listing.createFileStatusListingIterator(path,
              createListObjectsRequest(key, "/"), filter,
              new Listing.AcceptAllButSelfAndS3nDirs(path)));
    }
  } catch (AmazonClientException e) {
    throw translateException("listLocatedStatus", path, e);
  }
}
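From a client's point of view, all of this is reached through the generic FileSystem API. A minimal hedged sketch follows; the bucket name and prefix are placeholders, and any FileSystem URI would behave the same way.

import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class ListBucketDir {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    // "s3a://my-bucket/data" is a placeholder path.
    Path dir = new Path("s3a://my-bucket/data");
    FileSystem fs = FileSystem.get(URI.create("s3a://my-bucket/"), conf);
    // listLocatedStatus returns the direct children of the directory.
    RemoteIterator<LocatedFileStatus> it = fs.listLocatedStatus(dir);
    while (it.hasNext()) {
      LocatedFileStatus status = it.next();
      System.out.println(status.getPath() + " len=" + status.getLen()
          + (status.isDirectory() ? " (dir)" : ""));
    }
  }
}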
use of org.apache.hadoop.fs.LocatedFileStatus in project hadoop by apache.
the class S3AFileSystem method listFiles.
/**
 * {@inheritDoc}.
 *
 * This implementation is optimized for S3, which can do a bulk listing
 * of all entries under a path in one single operation. Thus there is
 * no need to recursively walk the directory tree.
 *
 * Instead a {@link ListObjectsRequest} is created requesting a (windowed)
 * listing of all entries under the given path. This is used to construct
 * an {@code ObjectListingIterator} instance, iteratively returning the
 * sequence of lists of elements under the path. This is then iterated
 * over in a {@code FileStatusListingIterator}, which generates
 * {@link S3AFileStatus} instances, one per listing entry.
 * These are then translated into {@link LocatedFileStatus} instances.
 *
 * This is essentially a nested and wrapped set of iterators, with some
 * generator classes; an architecture which may become less convoluted
 * using lambda-expressions.
 * @param f a path
 * @param recursive if the subdirectories need to be traversed recursively
 *
 * @return an iterator that traverses statuses of the files/directories
 *         in the given path
 * @throws FileNotFoundException if {@code path} does not exist
 * @throws IOException if any I/O error occurred
 */
@Override
public RemoteIterator<LocatedFileStatus> listFiles(Path f, boolean recursive)
    throws FileNotFoundException, IOException {
  incrementStatistic(INVOCATION_LIST_FILES);
  Path path = qualify(f);
  LOG.debug("listFiles({}, {})", path, recursive);
  try {
    // lookup dir triggers existence check
    final FileStatus fileStatus = getFileStatus(path);
    if (fileStatus.isFile()) {
      // simple case: File
      LOG.debug("Path is a file");
      return new Listing.SingleStatusRemoteIterator(toLocatedFileStatus(fileStatus));
    } else {
      // directory: do a bulk operation
      String key = maybeAddTrailingSlash(pathToKey(path));
      String delimiter = recursive ? null : "/";
      LOG.debug("Requesting all entries under {} with delimiter '{}'", key, delimiter);
      return listing.createLocatedFileStatusIterator(
          listing.createFileStatusListingIterator(path,
              createListObjectsRequest(key, delimiter), ACCEPT_ALL,
              new Listing.AcceptFilesOnly(path)));
    }
  } catch (AmazonClientException e) {
    throw translateException("listFiles", path, e);
  }
}
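The recursive flag maps directly onto the delimiter logic above: a recursive listing drops the "/" delimiter and sees every descendant file in one keyspace scan, while a non-recursive listing only returns files directly under the path. A hedged sketch contrasting the two follows; the prefix argument and class name are placeholders.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class CompareListings {
  static long count(FileSystem fs, Path prefix, boolean recursive) throws IOException {
    long n = 0;
    RemoteIterator<LocatedFileStatus> it = fs.listFiles(prefix, recursive);
    while (it.hasNext()) {
      it.next();
      n++;
    }
    return n;
  }

  public static void main(String[] args) throws IOException {
    Path prefix = new Path(args[0]); // placeholder, e.g. an s3a:// or hdfs:// prefix
    FileSystem fs = prefix.getFileSystem(new Configuration());
    System.out.println("direct files: " + count(fs, prefix, false));
    System.out.println("all descendant files: " + count(fs, prefix, true));
  }
}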