use of org.apache.hadoop.fs.PathFilter in project hadoop by apache.
the class GenerateData method publishPlainDataStatistics.
static DataStatistics publishPlainDataStatistics(Configuration conf, Path inputDir) throws IOException {
  FileSystem fs = inputDir.getFileSystem(conf);
  // obtain input data file statuses
  long dataSize = 0;
  long fileCount = 0;
  RemoteIterator<LocatedFileStatus> iter = fs.listFiles(inputDir, true);
  PathFilter filter = new Utils.OutputFileUtils.OutputFilesFilter();
  while (iter.hasNext()) {
    LocatedFileStatus lStatus = iter.next();
    if (filter.accept(lStatus.getPath())) {
      dataSize += lStatus.getLen();
      ++fileCount;
    }
  }
  // publish the plain data statistics
  LOG.info("Total size of input data : " + StringUtils.humanReadableInt(dataSize));
  LOG.info("Total number of input data files : " + fileCount);
  return new DataStatistics(dataSize, fileCount, false);
}
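PathFilter itself is a single-method interface, so a filter in the spirit of OutputFilesFilter can also be written directly. A minimal sketch (the class name HiddenAndSideFileFilter is illustrative, not part of Hadoop):
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Illustrative stand-in for Utils.OutputFileUtils.OutputFilesFilter: reject
// hidden files and MapReduce side files such as _SUCCESS and _logs.
public class HiddenAndSideFileFilter implements PathFilter {
  @Override
  public boolean accept(Path path) {
    String name = path.getName();
    return !name.startsWith(".") && !name.startsWith("_");
  }
}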
use of org.apache.hadoop.fs.PathFilter in project hadoop by apache.
the class FileInputFormat method listStatus.
/** List input directories.
* Subclasses may override to, e.g., select only files matching a regular
* expression.
*
* @param job the job to list input paths for
* @return array of FileStatus objects
* @throws IOException if no input paths are specified.
*/
protected List<FileStatus> listStatus(JobContext job) throws IOException {
  Path[] dirs = getInputPaths(job);
  if (dirs.length == 0) {
    throw new IOException("No input paths specified in job");
  }
  // get tokens for all the required FileSystems..
  TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());
  // Whether we need to recursively look into the directory structure
  boolean recursive = getInputDirRecursive(job);
  // creates a MultiPathFilter with the hiddenFileFilter and the
  // user-provided one (if any).
  List<PathFilter> filters = new ArrayList<PathFilter>();
  filters.add(hiddenFileFilter);
  PathFilter jobFilter = getInputPathFilter(job);
  if (jobFilter != null) {
    filters.add(jobFilter);
  }
  PathFilter inputFilter = new MultiPathFilter(filters);
  List<FileStatus> result = null;
  int numThreads = job.getConfiguration().getInt(LIST_STATUS_NUM_THREADS, DEFAULT_LIST_STATUS_NUM_THREADS);
  StopWatch sw = new StopWatch().start();
  if (numThreads == 1) {
    result = singleThreadedListStatus(job, dirs, inputFilter, recursive);
  } else {
    Iterable<FileStatus> locatedFiles = null;
    try {
      LocatedFileStatusFetcher locatedFileStatusFetcher = new LocatedFileStatusFetcher(job.getConfiguration(), dirs, recursive, inputFilter, true);
      locatedFiles = locatedFileStatusFetcher.getFileStatuses();
    } catch (InterruptedException e) {
      throw new IOException("Interrupted while getting file statuses");
    }
    result = Lists.newArrayList(locatedFiles);
  }
  sw.stop();
  if (LOG.isDebugEnabled()) {
    LOG.debug("Time taken to get FileStatuses: " + sw.now(TimeUnit.MILLISECONDS));
  }
  LOG.info("Total input files to process : " + result.size());
  return result;
}
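MultiPathFilter is a private inner class of FileInputFormat, so it is not usable outside that class; its AND-composition behavior can be approximated with a sketch like the following (AndPathFilter is an illustrative name, not a Hadoop class):
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Accepts a path only if every delegate filter accepts it, mirroring how
// FileInputFormat combines hiddenFileFilter with the user-supplied filter.
public class AndPathFilter implements PathFilter {
  private final List<PathFilter> filters;

  public AndPathFilter(List<PathFilter> filters) {
    this.filters = filters;
  }

  @Override
  public boolean accept(Path path) {
    for (PathFilter filter : filters) {
      if (!filter.accept(path)) {
        return false;
      }
    }
    return true;
  }
}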
use of org.apache.hadoop.fs.PathFilter in project hbase by apache.
the class HFileArchiver method archiveRegion.
/**
* Remove an entire region from the table directory via archiving the region's hfiles.
* @param fs {@link FileSystem} from which to remove the region
* @param rootdir {@link Path} to the root directory where hbase files are stored (for building
* the archive path)
* @param tableDir {@link Path} to where the table is being stored (for building the archive path)
* @param regionDir {@link Path} to where a region is being stored (for building the archive path)
* @return <tt>true</tt> if the region was successfully deleted. <tt>false</tt> if the filesystem
* operations could not complete.
* @throws IOException if the request cannot be completed
*/
public static boolean archiveRegion(FileSystem fs, Path rootdir, Path tableDir, Path regionDir) throws IOException {
  if (LOG.isDebugEnabled()) {
    LOG.debug("ARCHIVING " + regionDir.toString());
  }
  // make sure we can archive
  if (tableDir == null || regionDir == null) {
    LOG.error("No archive directory could be found because tabledir (" + tableDir + ") or regiondir (" + regionDir + ") was null. Deleting files instead.");
    deleteRegionWithoutArchiving(fs, regionDir);
    // we should have archived, but failed to. It doesn't matter whether we deleted
    // the archived files correctly or not.
    return false;
  }
  // make sure the regiondir lives under the tabledir
  Preconditions.checkArgument(regionDir.toString().startsWith(tableDir.toString()));
  Path regionArchiveDir = HFileArchiveUtil.getRegionArchiveDir(rootdir, FSUtils.getTableName(tableDir), regionDir.getName());
  FileStatusConverter getAsFile = new FileStatusConverter(fs);
  // otherwise, we attempt to archive the store files
  // build collection of just the store directories to archive
  Collection<File> toArchive = new ArrayList<>();
  final PathFilter dirFilter = new FSUtils.DirFilter(fs);
  PathFilter nonHidden = new PathFilter() {
    @Override
    public boolean accept(Path file) {
      return dirFilter.accept(file) && !file.getName().toString().startsWith(".");
    }
  };
  FileStatus[] storeDirs = FSUtils.listStatus(fs, regionDir, nonHidden);
  // if there are no files, we can just delete the directory and return
  if (storeDirs == null) {
    LOG.debug("Region directory (" + regionDir + ") was empty, just deleting and returning!");
    return deleteRegionWithoutArchiving(fs, regionDir);
  }
  // convert the files in the region to a File
  toArchive.addAll(Lists.transform(Arrays.asList(storeDirs), getAsFile));
  LOG.debug("Archiving " + toArchive);
  List<File> failedArchive = resolveAndArchive(fs, regionArchiveDir, toArchive, EnvironmentEdgeManager.currentTime());
  if (!failedArchive.isEmpty()) {
    throw new FailedArchiveException("Failed to archive/delete all the files for region:" + regionDir.getName() + " into " + regionArchiveDir + ". Something is probably awry on the filesystem.", Collections2.transform(failedArchive, FUNC_FILE_TO_PATH));
  }
  // if that was successful, then we delete the region
  return deleteRegionWithoutArchiving(fs, regionDir);
}
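FSUtils.listStatus is an HBase convenience wrapper; the equivalent stock Hadoop call is FileSystem.listStatus(Path, PathFilter). A standalone usage sketch, assuming an illustrative directory path:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListNonHiddenChildren {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // the path below is illustrative, not a real layout requirement
    Path dir = new Path("/hbase/data/default/mytable/region");
    FileSystem fs = dir.getFileSystem(conf);
    PathFilter nonHidden = new PathFilter() {
      @Override
      public boolean accept(Path p) {
        return !p.getName().startsWith(".");
      }
    };
    // listStatus applies the filter before returning, so hidden entries
    // are dropped from the result array
    for (FileStatus status : fs.listStatus(dir, nonHidden)) {
      System.out.println(status.getPath());
    }
  }
}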
use of org.apache.hadoop.fs.PathFilter in project hbase by apache.
the class WALSplitter method getSplitEditFilesSorted.
/**
* Returns sorted set of edit files made by splitter, excluding files
* with '.temp' suffix.
*
* @param fs
* @param regiondir
* @return Files in passed <code>regiondir</code> as a sorted set.
* @throws IOException
*/
public static NavigableSet<Path> getSplitEditFilesSorted(final FileSystem fs, final Path regiondir) throws IOException {
  NavigableSet<Path> filesSorted = new TreeSet<>();
  Path editsdir = getRegionDirRecoveredEditsDir(regiondir);
  if (!fs.exists(editsdir))
    return filesSorted;
  FileStatus[] files = FSUtils.listStatus(fs, editsdir, new PathFilter() {
    @Override
    public boolean accept(Path p) {
      boolean result = false;
      try {
        // Return files and only files that match the editfile names pattern.
        // There can be other files in this directory other than edit files.
        // In particular, on error, we'll move aside the bad edit file giving
        // it a timestamp suffix. See moveAsideBadEditsFile.
        Matcher m = EDITFILES_NAME_PATTERN.matcher(p.getName());
        result = fs.isFile(p) && m.matches();
        // Skip a file whose name ends with RECOVERED_LOG_TMPFILE_SUFFIX,
        // because it means the split WAL thread is still writing this file.
        if (p.getName().endsWith(RECOVERED_LOG_TMPFILE_SUFFIX)) {
          result = false;
        }
        // Skip SeqId files
        if (isSequenceIdFile(p)) {
          result = false;
        }
      } catch (IOException e) {
        LOG.warn("Failed isFile check on " + p);
      }
      return result;
    }
  });
  if (files == null) {
    return filesSorted;
  }
  for (FileStatus status : files) {
    filesSorted.add(status.getPath());
  }
  return filesSorted;
}
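Since PathFilter has a single abstract method, it can also be supplied as a lambda on Java 8+. A simplified, self-contained sketch of the '.temp' exclusion above, with an illustrative directory path and suffix literal:
import java.util.NavigableSet;
import java.util.TreeSet;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class SortedEditFiles {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // illustrative location of a recovered-edits directory; listStatus
    // throws FileNotFoundException if it does not exist
    Path editsDir = new Path("/hbase/data/default/mytable/region/recovered.edits");
    FileSystem fs = editsDir.getFileSystem(conf);
    // lambda form of the anonymous PathFilter used by getSplitEditFilesSorted
    PathFilter noTempFiles = p -> !p.getName().endsWith(".temp");
    NavigableSet<Path> sorted = new TreeSet<>();
    for (FileStatus status : fs.listStatus(editsDir, noTempFiles)) {
      sorted.add(status.getPath());
    }
    System.out.println(sorted);
  }
}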
use of org.apache.hadoop.fs.PathFilter in project hbase by apache.
the class TestHFileArchiving method testDeleteRegionWithNoStoreFiles.
/**
* Test that the region directory is removed when we archive a region without store files, but
* still has hidden files.
* @throws Exception
*/
@Test
public void testDeleteRegionWithNoStoreFiles() throws Exception {
  final TableName tableName = TableName.valueOf(name.getMethodName());
  UTIL.createTable(tableName, TEST_FAM);
  // get the current store files for the region
  List<HRegion> servingRegions = UTIL.getHBaseCluster().getRegions(tableName);
  // make sure we only have 1 region serving this table
  assertEquals(1, servingRegions.size());
  HRegion region = servingRegions.get(0);
  FileSystem fs = region.getRegionFileSystem().getFileSystem();
  // make sure there are some files in the regiondir
  Path rootDir = FSUtils.getRootDir(fs.getConf());
  Path regionDir = HRegion.getRegionDir(rootDir, region.getRegionInfo());
  FileStatus[] regionFiles = FSUtils.listStatus(fs, regionDir, null);
  Assert.assertNotNull("No files in the region directory", regionFiles);
  if (LOG.isDebugEnabled()) {
    List<Path> files = new ArrayList<>();
    for (FileStatus file : regionFiles) {
      files.add(file.getPath());
    }
    LOG.debug("Current files:" + files);
  }
  // delete the visible folders so we just have hidden files/folders
  final PathFilter dirFilter = new FSUtils.DirFilter(fs);
  PathFilter nonHidden = new PathFilter() {
    @Override
    public boolean accept(Path file) {
      return dirFilter.accept(file) && !file.getName().toString().startsWith(".");
    }
  };
  FileStatus[] storeDirs = FSUtils.listStatus(fs, regionDir, nonHidden);
  for (FileStatus store : storeDirs) {
    LOG.debug("Deleting store for test");
    fs.delete(store.getPath(), true);
  }
  // then archive the region
  HFileArchiver.archiveRegion(UTIL.getConfiguration(), fs, region.getRegionInfo());
  // and check to make sure the region directory got deleted
  assertFalse("Region directory (" + regionDir + ") still exists.", fs.exists(regionDir));
  UTIL.deleteTable(tableName);
}
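The nonHidden filter above duplicates the anonymous class in HFileArchiver.archiveRegion; one way to avoid that duplication is to extract a named filter. A sketch under that assumption (VisibleDirFilter is an illustrative name; FSUtils.DirFilter is the real HBase helper it wraps):
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hbase.util.FSUtils;

// Accepts only non-hidden directories: the same predicate that both
// HFileArchiver.archiveRegion and this test build anonymously.
public class VisibleDirFilter implements PathFilter {
  private final PathFilter dirFilter;

  public VisibleDirFilter(FileSystem fs) {
    this.dirFilter = new FSUtils.DirFilter(fs);
  }

  @Override
  public boolean accept(Path path) {
    return dirFilter.accept(path) && !path.getName().startsWith(".");
  }
}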