Search in sources :

Example 61 with PathFilter

Use of org.apache.hadoop.fs.PathFilter in the project hive by apache.

In the class Hive, the method listFilesCreatedByQuery.

/**
 * Lists the files under {@code loadPath} that were created by the query
 * identified by the given write id / statement id pair.
 *
 * @param loadPath directory to scan for query-created files
 * @param writeId  write id identifying the transaction that created the files
 * @param stmtId   statement id within that transaction
 * @return the matching file statuses, or an empty list if {@code loadPath}
 *         does not exist
 * @throws HiveException if listing fails for any reason other than the
 *         directory being absent
 */
private List<FileStatus> listFilesCreatedByQuery(Path loadPath, long writeId, int stmtId) throws HiveException {
    try {
        FileSystem srcFs = loadPath.getFileSystem(conf);
        // Match only paths that encode this writeId/stmtId pair.
        PathFilter filter = new AcidUtils.IdFullPathFiler(writeId, stmtId, loadPath);
        return HdfsUtils.listLocatedFileStatus(srcFs, loadPath, filter, true);
    } catch (FileNotFoundException e) {
        // An absent directory simply means the query created no files here.
        LOG.info("directory does not exist: " + loadPath);
    } catch (IOException e) {
        LOG.error("Error listing files", e);
        throw new HiveException(e);
    }
    // Typed empty list instead of the raw Collections.EMPTY_LIST constant,
    // which triggers an unchecked-assignment warning.
    return Collections.emptyList();
}
Also used : PathFilter(org.apache.hadoop.fs.PathFilter) FileSystem(org.apache.hadoop.fs.FileSystem) DistributedFileSystem(org.apache.hadoop.hdfs.DistributedFileSystem) FileNotFoundException(java.io.FileNotFoundException) IOException(java.io.IOException)

Example 62 with PathFilter

Use of org.apache.hadoop.fs.PathFilter in the project hive by apache.

In the class PreUpgradeTool, the method needsCompaction.

/**
 * Decides whether the given ACID table/partition location contains delta data
 * that requires compaction. When compaction is needed, the estimated data size
 * of the location (plus any original, pre-ACID files) is accumulated into
 * {@code compactionMetaInfo} for later cost estimation.
 *
 * @param location path to a partition (or table if not partitioned) dir
 * @param conf Hive configuration used to resolve the FileSystem and ACID state
 * @param compactionMetaInfo accumulator for the byte size of data to compact
 * @param txns valid transaction list used by {@code AcidUtils.getAcidState}
 * @return true if {@code location} needs compacting, false otherwise
 * @throws IOException on filesystem access errors
 */
private static boolean needsCompaction(Path location, HiveConf conf, CompactionMetaInfo compactionMetaInfo, ValidTxnList txns) throws IOException {
    FileSystem fs = location.getFileSystem(conf);
    FileStatus[] deltas = fs.listStatus(location, new PathFilter() {

        @Override
        public boolean accept(Path path) {
            // Only delta/delete_delta dirs can contain update/delete events;
            // base dirs cannot, so they are not listed here.
            return path.getName().startsWith("delta_") || path.getName().startsWith("delete_delta_");
        }
    });
    if (deltas == null || deltas.length == 0) {
        // No delta dirs at all: compaction is required
        // only if there are update/delete events.
        return false;
    }
    /* getAcidState() is smart enough not to return any deltas in current state if there
    * is a base that covers them, i.e. if they were compacted but not yet cleaned.
    * This means re-checking whether compaction is needed should be cheap(er). */
    AcidUtils.Directory dir = AcidUtils.getAcidState(location, conf, txns);
    deltaLoop: for (AcidUtils.ParsedDelta delta : dir.getCurrentDirectories()) {
        FileStatus[] buckets = fs.listStatus(delta.getPath(), new PathFilter() {

            @Override
            public boolean accept(Path path) {
                // Matches both bucket_x and bucket_x_flush_length files.
                return path.getName().startsWith("bucket_");
            }
        });
        for (FileStatus bucket : buckets) {
            if (bucket.getPath().getName().endsWith("_flush_length")) {
                // streaming ingest dir - cannot have update/delete events
                continue deltaLoop;
            }
            if (needsCompaction(bucket, fs)) {
                // found delete events - this 'location' needs compacting
                compactionMetaInfo.addBytes(getDataSize(location, conf));
                // Also count original (pre-ACID) files toward the size,
                // for 'cost' estimation later.
                for (HadoopShims.HdfsFileStatusWithId origFile : dir.getOriginalFiles()) {
                    FileStatus fileStatus = origFile.getFileStatus();
                    if (fileStatus != null) {
                        compactionMetaInfo.addBytes(fileStatus.getLen());
                    }
                }
                return true;
            }
        }
    }
    return false;
}
Also used : Path(org.apache.hadoop.fs.Path) PathFilter(org.apache.hadoop.fs.PathFilter) FileStatus(org.apache.hadoop.fs.FileStatus) FileSystem(org.apache.hadoop.fs.FileSystem) AcidUtils(org.apache.hadoop.hive.ql.io.AcidUtils)

Example 63 with PathFilter

Use of org.apache.hadoop.fs.PathFilter in the project shifu by ShifuML.

In the class PostTrainModelProcessor, the method updateAvgScores.

/**
 * Reads the post-train MapReduce output and stores the per-bin average scores
 * on the corresponding {@code ColumnConfig} entries. Each output line has the
 * form {@code <columnIndex>\t<score1,score2,...>}.
 *
 * @param source              source type used to locate the output files
 * @param postTrainOutputPath path of the post-train job output
 * @throws IOException if the output files cannot be read
 */
private void updateAvgScores(SourceType source, String postTrainOutputPath) throws IOException {
    List<Scanner> scanners = null;
    try {
        // Only reducer output files (part-r-*) carry the score data.
        PathFilter partFileFilter = new PathFilter() {

            @Override
            public boolean accept(Path path) {
                return path.toString().contains("part-r-");
            }
        };
        scanners = ShifuFileUtils.getDataScanners(postTrainOutputPath, source, partFileFilter);
        for (Scanner scanner : scanners) {
            while (scanner.hasNextLine()) {
                String[] keyValue = scanner.nextLine().trim().split("\t");
                String columnIndex = keyValue[0];
                String scoreCsv = keyValue[1];
                ColumnConfig columnConfig = this.columnConfigList.get(Integer.parseInt(columnIndex));
                // Value is a comma-separated list of per-bin average scores.
                List<Integer> binAvgScores = new ArrayList<Integer>();
                for (String score : scoreCsv.split(",")) {
                    binAvgScores.add(Integer.parseInt(score));
                }
                columnConfig.setBinAvgScore(binAvgScores);
            }
        }
    } finally {
        // release
        closeScanners(scanners);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Scanner(java.util.Scanner) PathFilter(org.apache.hadoop.fs.PathFilter) ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) ArrayList(java.util.ArrayList)

Example 64 with PathFilter

Use of org.apache.hadoop.fs.PathFilter in the project shifu by ShifuML.

In the class PostTrainModelProcessor, the method getFeatureImportance.

/**
 * Collects the feature-importance column ids from the MapReduce output.
 * Each output line is tab-separated and its first token is a column id.
 *
 * @param source source type used to locate the output files
 * @param output path of the feature-importance job output
 * @return the list of column ids, in the order they were read
 * @throws IOException if the output files cannot be read
 */
private List<Integer> getFeatureImportance(SourceType source, String output) throws IOException {
    List<Integer> featureImportance = new ArrayList<Integer>();
    List<Scanner> scanners = null;
    try {
        // Only reducer output files (part-r-*) are relevant.
        PathFilter partFileFilter = new PathFilter() {

            @Override
            public boolean accept(Path path) {
                return path.toString().contains("part-r-");
            }
        };
        scanners = ShifuFileUtils.getDataScanners(output, source, partFileFilter);
        for (Scanner scanner : scanners) {
            while (scanner.hasNextLine()) {
                // The first tab-separated token is the column id.
                String[] fields = scanner.nextLine().trim().split("\t");
                featureImportance.add(Integer.parseInt(fields[0]));
            }
        }
    } finally {
        // release
        closeScanners(scanners);
    }
    return featureImportance;
}
Also used : Path(org.apache.hadoop.fs.Path) Scanner(java.util.Scanner) PathFilter(org.apache.hadoop.fs.PathFilter) ArrayList(java.util.ArrayList)

Example 65 with PathFilter

Use of org.apache.hadoop.fs.PathFilter in the project apex-malhar by apache.

In the class FileStitcher, the method mergeBlocks.

/**
 * Stitches the blocks of the given file into a temporary output file and then
 * moves it to its final destination. Before writing, any leftover (vagrant)
 * temporary part files from earlier attempts are deleted. If a source block is
 * missing, recovery mode is assumed and the temp output file is removed.
 *
 * @param stitchedFileMetaData metadata describing the file to stitch
 * @throws IOException on filesystem errors
 */
protected void mergeBlocks(T stitchedFileMetaData) throws IOException {
    final Path dst = new Path(filePath, stitchedFileMetaData.getStitchedFileRelativePath());
    // Matches stale tmp part files produced for this destination.
    PathFilter tempFileFilter = new PathFilter() {

        @Override
        public boolean accept(Path path) {
            String name = path.getName();
            return name.startsWith(dst.getName()) && name.endsWith(PART_FILE_EXTENTION);
        }
    };
    Path parentDir = dst.getParent();
    if (outputFS.exists(parentDir)) {
        for (FileStatus vagrant : outputFS.listStatus(parentDir, tempFileFilter)) {
            LOG.debug("deleting vagrant file {}", vagrant.getPath().getName());
            outputFS.delete(vagrant.getPath(), true);
        }
    }
    tempOutFilePath = new Path(filePath, stitchedFileMetaData.getStitchedFileRelativePath() + '.' + System.currentTimeMillis() + PART_FILE_EXTENTION);
    try {
        writeTempOutputFile(stitchedFileMetaData);
        moveToFinalFile(stitchedFileMetaData);
    } catch (BlockNotFoundException e) {
        LOG.warn("Block file {} not found. Assuming recovery mode for file {}. ", e.getBlockPath(), stitchedFileMetaData.getStitchedFileRelativePath());
        // Remove temp output file
        outputFS.delete(tempOutFilePath, false);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) PathFilter(org.apache.hadoop.fs.PathFilter) FileStatus(org.apache.hadoop.fs.FileStatus)

Aggregations

PathFilter (org.apache.hadoop.fs.PathFilter)123 Path (org.apache.hadoop.fs.Path)114 FileStatus (org.apache.hadoop.fs.FileStatus)96 Test (org.junit.Test)47 IOException (java.io.IOException)42 FileSystem (org.apache.hadoop.fs.FileSystem)39 ArrayList (java.util.ArrayList)22 List (java.util.List)19 Configuration (org.apache.hadoop.conf.Configuration)18 Collections (java.util.Collections)11 BufferedReader (java.io.BufferedReader)9 InputStreamReader (java.io.InputStreamReader)9 FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream)9 Assert.assertEquals (org.junit.Assert.assertEquals)9 Assert.assertTrue (org.junit.Assert.assertTrue)9 URI (java.net.URI)8 Test (org.testng.annotations.Test)8 FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream)7 IGNORED (com.facebook.presto.hive.NestedDirectoryPolicy.IGNORED)6 RECURSE (com.facebook.presto.hive.NestedDirectoryPolicy.RECURSE)6