Use of org.apache.hadoop.fs.PathFilter in project hive by apache: class Hive, method listFilesCreatedByQuery.
private List<FileStatus> listFilesCreatedByQuery(Path loadPath, long writeId, int stmtId) throws HiveException {
  try {
    FileSystem srcFs = loadPath.getFileSystem(conf);
    PathFilter filter = new AcidUtils.IdFullPathFiler(writeId, stmtId, loadPath);
    return HdfsUtils.listLocatedFileStatus(srcFs, loadPath, filter, true);
  } catch (FileNotFoundException e) {
    LOG.info("directory does not exist: " + loadPath);
  } catch (IOException e) {
    LOG.error("Error listing files", e);
    throw new HiveException(e);
  }
  return Collections.EMPTY_LIST;
}
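The filter handed to HdfsUtils.listLocatedFileStatus is an AcidUtils helper whose implementation is not shown here. As a rough, standalone sketch of the same idea (the class name, constructor, and delta-directory naming below are assumptions for illustration, not the actual AcidUtils code), a PathFilter that keeps only the files created by one write/statement might look like this:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

/** Sketch only: accepts the load root, one assumed delta directory, and files inside it. */
public class WriteIdPathFilter implements PathFilter {
  private final String deltaDirName; // e.g. "delta_0000005_0000005_0001" (naming assumed)
  private final Path root;

  public WriteIdPathFilter(long writeId, int stmtId, Path root) {
    this.deltaDirName = String.format("delta_%07d_%07d_%04d", writeId, writeId, stmtId);
    this.root = root;
  }

  @Override
  public boolean accept(Path path) {
    // Accept the root itself, the matching delta directory, and anything directly inside it.
    if (path.equals(root) || path.getName().equals(deltaDirName)) {
      return true;
    }
    Path parent = path.getParent();
    return parent != null && parent.getName().equals(deltaDirName);
  }
}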
Use of org.apache.hadoop.fs.PathFilter in project hive by apache: class PreUpgradeTool, method needsCompaction.
/**
 * @param location - path to a partition (or table if not partitioned) dir
 */
private static boolean needsCompaction(Path location, HiveConf conf, CompactionMetaInfo compactionMetaInfo, ValidTxnList txns) throws IOException {
  FileSystem fs = location.getFileSystem(conf);
  FileStatus[] deltas = fs.listStatus(location, new PathFilter() {
    @Override
    public boolean accept(Path path) {
      // match both delta_ and delete_delta_ directories
      return path.getName().startsWith("delta_") || path.getName().startsWith("delete_delta_");
    }
  });
  if (deltas == null || deltas.length == 0) {
    // no deltas means no update/delete events, so there is nothing to compact
    return false;
  }
  /* getAcidState() is smart enough not to return any deltas in current if there is a base that
   * covers them, i.e. if they were compacted but not yet cleaned. This means re-checking whether
   * compaction is needed should be cheap(er). */
  AcidUtils.Directory dir = AcidUtils.getAcidState(location, conf, txns);
  deltaLoop: for (AcidUtils.ParsedDelta delta : dir.getCurrentDirectories()) {
    FileStatus[] buckets = fs.listStatus(delta.getPath(), new PathFilter() {
      @Override
      public boolean accept(Path path) {
        // bucket_x or bucket_x__flush_length
        return path.getName().startsWith("bucket_");
      }
    });
    for (FileStatus bucket : buckets) {
      if (bucket.getPath().getName().endsWith("_flush_length")) {
        // streaming ingest dir - cannot have update/delete events
        continue deltaLoop;
      }
      if (needsCompaction(bucket, fs)) {
        // found delete events - this 'location' needs compacting
        compactionMetaInfo.addBytes(getDataSize(location, conf));
        // un-compacted original files will be included in the compaction, so count their size
        // toward the 'cost' estimation later as well
        for (HadoopShims.HdfsFileStatusWithId origFile : dir.getOriginalFiles()) {
          FileStatus fileStatus = origFile.getFileStatus();
          if (fileStatus != null) {
            compactionMetaInfo.addBytes(fileStatus.getLen());
          }
        }
        return true;
      }
    }
  }
  return false;
}
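Since PathFilter declares a single accept(Path) method, the anonymous classes above can also be written as lambdas on Java 8+. A stylistic sketch, behaviorally equivalent to the first anonymous filter (fs and location come from the method above):

PathFilter deltaFilter = path ->
    path.getName().startsWith("delta_") || path.getName().startsWith("delete_delta_");
FileStatus[] deltas = fs.listStatus(location, deltaFilter);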
Use of org.apache.hadoop.fs.PathFilter in project shifu by ShifuML: class PostTrainModelProcessor, method updateAvgScores.
private void updateAvgScores(SourceType source, String postTrainOutputPath) throws IOException {
    List<Scanner> scanners = null;
    try {
        scanners = ShifuFileUtils.getDataScanners(postTrainOutputPath, source, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.toString().contains("part-r-");
            }
        });
        for (Scanner scanner : scanners) {
            while (scanner.hasNextLine()) {
                String line = scanner.nextLine().trim();
                String[] keyValues = line.split("\t");
                String key = keyValues[0];
                String value = keyValues[1];
                ColumnConfig config = this.columnConfigList.get(Integer.parseInt(key));
                List<Integer> binAvgScores = new ArrayList<Integer>();
                String[] avgScores = value.split(",");
                for (int i = 0; i < avgScores.length; i++) {
                    binAvgScores.add(Integer.parseInt(avgScores[i]));
                }
                config.setBinAvgScore(binAvgScores);
            }
        }
    } finally {
        // release
        closeScanners(scanners);
    }
}
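This method assumes a specific reducer output layout: each line of a part-r-* file is a tab-separated key and value, where the key is a column index and the value is a comma-separated list of bin average scores. A minimal, self-contained sketch of that parsing step, using a made-up input line:

import java.util.ArrayList;
import java.util.List;

public class PostTrainLineParser {
  public static void main(String[] args) {
    String line = "12\t100,250,430";                 // hypothetical reducer output line
    String[] keyValue = line.trim().split("\t");
    int columnIndex = Integer.parseInt(keyValue[0]); // index into columnConfigList
    List<Integer> binAvgScores = new ArrayList<>();
    for (String score : keyValue[1].split(",")) {
      binAvgScores.add(Integer.parseInt(score));
    }
    System.out.println(columnIndex + " -> " + binAvgScores);
  }
}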
Use of org.apache.hadoop.fs.PathFilter in project shifu by ShifuML: class PostTrainModelProcessor, method getFeatureImportance.
private List<Integer> getFeatureImportance(SourceType source, String output) throws IOException {
    List<Integer> featureImportance = new ArrayList<Integer>();
    List<Scanner> scanners = null;
    try {
        scanners = ShifuFileUtils.getDataScanners(output, source, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.toString().contains("part-r-");
            }
        });
        for (Scanner scanner : scanners) {
            while (scanner.hasNextLine()) {
                String line = scanner.nextLine().trim();
                String[] keyValues = line.split("\t");
                String key = keyValues[0];
                featureImportance.add(Integer.parseInt(key));
            }
        }
    } finally {
        // release
        closeScanners(scanners);
    }
    return featureImportance;
}
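Both updateAvgScores and getFeatureImportance build the same anonymous filter for reducer output files. If that duplication is a concern, the filter could be extracted into a shared constant; this is a refactoring sketch under that assumption, not code from the Shifu project:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Shared filter for MapReduce reducer output files ("part-r-00000", "part-r-00001", ...),
// behaviorally equivalent to the anonymous classes used in both methods above.
public final class PartRFileFilter implements PathFilter {
  public static final PathFilter INSTANCE = new PartRFileFilter();

  @Override
  public boolean accept(Path path) {
    return path.toString().contains("part-r-");
  }
}

Either method could then call ShifuFileUtils.getDataScanners(output, source, PartRFileFilter.INSTANCE).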
Use of org.apache.hadoop.fs.PathFilter in project apex-malhar by apache: class FileStitcher, method mergeBlocks.
protected void mergeBlocks(T stitchedFileMetaData) throws IOException {
  // when writing to tmp files there can be stray tmp files left over from earlier runs, which we have to clean up
  final Path dst = new Path(filePath, stitchedFileMetaData.getStitchedFileRelativePath());
  PathFilter tempFileFilter = new PathFilter() {
    @Override
    public boolean accept(Path path) {
      return path.getName().startsWith(dst.getName()) && path.getName().endsWith(PART_FILE_EXTENTION);
    }
  };
  if (outputFS.exists(dst.getParent())) {
    FileStatus[] statuses = outputFS.listStatus(dst.getParent(), tempFileFilter);
    for (FileStatus status : statuses) {
      String statusName = status.getPath().getName();
      LOG.debug("deleting vagrant file {}", statusName);
      outputFS.delete(status.getPath(), true);
    }
  }
  tempOutFilePath = new Path(filePath, stitchedFileMetaData.getStitchedFileRelativePath() + '.' + System.currentTimeMillis() + PART_FILE_EXTENTION);
  try {
    writeTempOutputFile(stitchedFileMetaData);
    moveToFinalFile(stitchedFileMetaData);
  } catch (BlockNotFoundException e) {
    LOG.warn("Block file {} not found. Assuming recovery mode for file {}. ", e.getBlockPath(), stitchedFileMetaData.getStitchedFileRelativePath());
    // remove the temp output file
    outputFS.delete(tempOutFilePath, false);
  }
}
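The tempFileFilter matches leftover temporary part files by using the destination file's name as a prefix and PART_FILE_EXTENTION as a suffix. A small sketch of what it accepts, assuming a hypothetical extension value of ".part" (the actual constant in FileStitcher is not shown here):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class TempFileFilterDemo {
  public static void main(String[] args) {
    final String partExtension = ".part";             // assumed value of PART_FILE_EXTENTION
    final Path dst = new Path("/out/data/file1.dat"); // hypothetical stitched destination

    PathFilter tempFileFilter = path ->
        path.getName().startsWith(dst.getName()) && path.getName().endsWith(partExtension);

    // Leftover temp file from an earlier attempt: accepted, so it would be deleted.
    System.out.println(tempFileFilter.accept(new Path("/out/data/file1.dat.1589000000000.part"))); // true
    // The final output file itself: rejected, so it is left alone.
    System.out.println(tempFileFilter.accept(new Path("/out/data/file1.dat")));                    // false
  }
}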