Search in sources :

Example 21 with FileStatus

use of org.apache.flink.core.fs.FileStatus in project flink by apache.

the class HadoopInputFormatBase method getFileStats.

// --------------------------------------------------------------------------------------------
//  Helper methods
// --------------------------------------------------------------------------------------------
/**
 * Gathers the file statuses behind the given Hadoop paths and derives size statistics from them.
 *
 * <p>For every input path the status is fetched; a directory contributes its direct, non-directory
 * children (no recursion), while a plain file contributes itself. Each collected status is appended
 * to {@code files}, so the caller's list is modified even when the cached statistics are returned.
 *
 * @param cachedStats previously computed statistics, or {@code null} if there are none
 * @param hadoopFilePaths the Hadoop paths to inspect
 * @param files output list that receives the status of every file found
 * @return {@code cachedStats} if it is non-null and no inspected path or file has a modification
 *         time newer than the cached one; otherwise freshly built statistics whose size is the sum
 *         of all file lengths, or {@code BaseStatistics.SIZE_UNKNOWN} if that sum is not positive
 * @throws IOException declared for the file system lookups performed here
 */
private FileBaseStatistics getFileStats(FileBaseStatistics cachedStats, org.apache.hadoop.fs.Path[] hadoopFilePaths, ArrayList<FileStatus> files) throws IOException {
    long newestModTime = 0L;

    // Walk every input path, collecting file statuses and tracking the newest modification time.
    for (org.apache.hadoop.fs.Path hadoopPath : hadoopFilePaths) {
        final Path flinkPath = new Path(hadoopPath.toUri());
        final FileSystem fileSystem = FileSystem.get(flinkPath.toUri());
        final FileStatus rootStatus = fileSystem.getFileStatus(flinkPath);
        newestModTime = Math.max(newestModTime, rootStatus.getModificationTime());

        if (!rootStatus.isDir()) {
            // A plain file is taken as is.
            files.add(rootStatus);
            continue;
        }

        // A directory: take its direct children that are files, skipping sub-directories.
        final FileStatus[] children = fileSystem.listStatus(flinkPath);
        files.ensureCapacity(files.size() + children.length);
        for (FileStatus child : children) {
            if (child.isDir()) {
                continue;
            }
            files.add(child);
            newestModTime = Math.max(child.getModificationTime(), newestModTime);
        }
    }

    // Nothing changed since the cached statistics were computed, so reuse them.
    final boolean cacheStillValid =
            cachedStats != null && newestModTime <= cachedStats.getLastModificationTime();
    if (cacheStillValid) {
        return cachedStats;
    }

    // Sum up the length of everything collected in the list.
    long totalLength = 0;
    for (FileStatus collected : files) {
        totalLength += collected.getLen();
    }

    // A non-positive total is reported as "unknown" rather than as a real size.
    final long reportedLength = totalLength > 0 ? totalLength : BaseStatistics.SIZE_UNKNOWN;
    return new FileBaseStatistics(newestModTime, reportedLength, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
}
Also used : Path(org.apache.flink.core.fs.Path) FileStatus(org.apache.flink.core.fs.FileStatus) FileBaseStatistics(org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics) FileSystem(org.apache.flink.core.fs.FileSystem)

Example 22 with FileStatus

use of org.apache.flink.core.fs.FileStatus in project flink by apache.

the class MapRFileSystem method listStatus.

/**
 * Lists the entries under the given path by delegating to the wrapped Hadoop file system and
 * wrapping each returned Hadoop status in a {@link HadoopFileStatus}.
 *
 * @param f the path to list
 * @return one wrapped status per entry returned by the underlying file system, in the same order
 * @throws IOException declared for the underlying listing call
 */
@Override
public FileStatus[] listStatus(final Path f) throws IOException {
    final org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path(f.toString());
    final org.apache.hadoop.fs.FileStatus[] hadoopStatuses = this.fs.listStatus(hadoopPath);

    // Convert each Hadoop status into the FileStatus type this method returns.
    final FileStatus[] wrapped = new FileStatus[hadoopStatuses.length];
    int next = 0;
    for (org.apache.hadoop.fs.FileStatus hadoopStatus : hadoopStatuses) {
        wrapped[next++] = new HadoopFileStatus(hadoopStatus);
    }
    return wrapped;
}
Also used : FileStatus(org.apache.flink.core.fs.FileStatus) HadoopFileStatus(org.apache.flink.runtime.fs.hdfs.HadoopFileStatus)

Example 23 with FileStatus

use of org.apache.flink.core.fs.FileStatus in project flink by apache.

the class ContinuousFileMonitoringFunction method monitorDirAndForwardSplits.

/**
 * Lists the eligible files under the monitored path, turns them into input splits and forwards
 * every split to the given source context.
 *
 * <p>Splits are forwarded group by group, keyed by modification time, in the iteration order of the
 * map returned by {@code getInputSplitsSortedByModTime} (presumably ascending modification time,
 * going by the helper's name; confirm against its implementation). After each group has been
 * emitted, {@code globalModificationTime} is raised to that group's modification time if it is
 * larger.
 *
 * <p>Must be called while holding {@code checkpointLock}; this is only verified when assertions are
 * enabled.
 *
 * @param fs the file system used to list the monitored path
 * @param context the source context that receives the splits
 * @throws IOException declared for the file listing and split creation helpers
 */
private void monitorDirAndForwardSplits(FileSystem fs, SourceContext<TimestampedFileInputSplit> context) throws IOException {
    assert (Thread.holdsLock(checkpointLock));
    Map<Path, FileStatus> eligibleFiles = listEligibleFiles(fs, new Path(path));
    Map<Long, List<TimestampedFileInputSplit>> splitsSortedByModTime = getInputSplitsSortedByModTime(eligibleFiles);
    for (Map.Entry<Long, List<TimestampedFileInputSplit>> splits : splitsSortedByModTime.entrySet()) {
        for (TimestampedFileInputSplit split : splits.getValue()) {
            // Parameterized logging: the message is only formatted when INFO is enabled.
            LOG.info("Forwarding split: {}", split);
            context.collect(split);
        }
        // update the global modification time only after the whole group has been forwarded
        globalModificationTime = Math.max(globalModificationTime, splits.getKey());
    }
}
Also used : Path(org.apache.flink.core.fs.Path) FileStatus(org.apache.flink.core.fs.FileStatus) ArrayList(java.util.ArrayList) List(java.util.List) HashMap(java.util.HashMap) Map(java.util.Map) TreeMap(java.util.TreeMap)

Example 24 with FileStatus

use of org.apache.flink.core.fs.FileStatus in project flink by apache.

the class ContinuousFileMonitoringFunction method listEligibleFiles.

/**
 * Collects the files under {@code path} that are still eligible for processing.
 *
 * <p>A non-directory entry is included unless {@code shouldIgnore} rejects it based on its path and
 * modification time. A directory entry is descended into recursively, but only when the format has
 * nested file enumeration enabled and accepts the directory.
 *
 * @param fileSystem The filesystem where the monitored directory resides.
 * @param path The directory whose entries are listed.
 * @return a map from file path to file status for every eligible file; an empty map if listing the
 *         path throws an {@link IOException} or yields {@code null}
 */
private Map<Path, FileStatus> listEligibleFiles(FileSystem fileSystem, Path path) throws IOException {
    final FileStatus[] listing;
    try {
        listing = fileSystem.listStatus(path);
    } catch (IOException ignored) {
        // Deliberately not propagated: returning nothing here delays the check for
        // eligible files instead of failing the caller.
        return Collections.emptyMap();
    }

    if (listing == null) {
        LOG.warn("Path does not exist: {}", path);
        return Collections.emptyMap();
    }

    final Map<Path, FileStatus> eligible = new HashMap<>();
    for (FileStatus entry : listing) {
        if (entry.isDir()) {
            // Descend only when nested enumeration is on and the format accepts this directory.
            if (format.getNestedFileEnumeration() && format.acceptFile(entry)) {
                eligible.putAll(listEligibleFiles(fileSystem, entry.getPath()));
            }
            continue;
        }

        // Plain file: keep it unless it is filtered out by path / modification time.
        final Path candidate = entry.getPath();
        if (!shouldIgnore(candidate, entry.getModificationTime())) {
            eligible.put(candidate, entry);
        }
    }
    return eligible;
}
Also used : Path(org.apache.flink.core.fs.Path) FileStatus(org.apache.flink.core.fs.FileStatus) HashMap(java.util.HashMap) IOException(java.io.IOException)

Aggregations

FileStatus (org.apache.flink.core.fs.FileStatus)24 Path (org.apache.flink.core.fs.Path)16 FileSystem (org.apache.flink.core.fs.FileSystem)13 ArrayList (java.util.ArrayList)9 IOException (java.io.IOException)8 FileInputSplit (org.apache.flink.core.fs.FileInputSplit)4 File (java.io.File)3 FileBaseStatistics (org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics)3 FSDataInputStream (org.apache.flink.core.fs.FSDataInputStream)3 HashMap (java.util.HashMap)2 List (java.util.List)2 TreeMap (java.util.TreeMap)2 BlockLocation (org.apache.flink.core.fs.BlockLocation)2 FSDataOutputStream (org.apache.flink.core.fs.FSDataOutputStream)2 DataInputViewStreamWrapper (org.apache.flink.core.memory.DataInputViewStreamWrapper)2 DataInputStream (java.io.DataInputStream)1 FileInputStream (java.io.FileInputStream)1 FileOutputStream (java.io.FileOutputStream)1 HashSet (java.util.HashSet)1 Map (java.util.Map)1