Use of org.apache.flink.core.fs.FileSystem in project flink by apache: the class BinaryInputFormat, method createStatistics.
/**
 * Fill in the statistics. The last modification time and the total input size are pre-filled.
 *
 * @param files
 *        The files that are associated with this block input format.
 * @param stats
 *        The pre-filled statistics.
 */
protected SequentialStatistics createStatistics(List<FileStatus> files, FileBaseStatistics stats) throws IOException {
    if (files.isEmpty()) {
        return null;
    }
    BlockInfo blockInfo = new BlockInfo();
    long totalCount = 0;
    for (FileStatus file : files) {
        // skip invalid files that are too short to contain the block info footer
        if (file.getLen() < blockInfo.getInfoSize()) {
            continue;
        }
        FileSystem fs = file.getPath().getFileSystem();
        try (FSDataInputStream fdis = fs.open(file.getPath(), blockInfo.getInfoSize())) {
            fdis.seek(file.getLen() - blockInfo.getInfoSize());
            blockInfo.read(new DataInputViewStreamWrapper(fdis));
            totalCount += blockInfo.getAccumulatedRecordCount();
        }
    }
    final float avgWidth = totalCount == 0 ? 0 : ((float) stats.getTotalInputSize() / totalCount);
    return new SequentialStatistics(stats.getLastModificationTime(), stats.getTotalInputSize(), avgWidth, totalCount);
}
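The FileSystem access pattern here — resolve the file system from the path, open a stream, and seek to the file length minus the footer size — generalizes to reading any fixed-size trailer. Below is a minimal self-contained sketch of that pattern; the FooterReader class, the readFooter helper, and the 32-byte footer size are hypothetical illustrations, not part of Flink:

import java.io.EOFException;
import java.io.IOException;

import org.apache.flink.core.fs.FSDataInputStream;
import org.apache.flink.core.fs.FileStatus;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;

public class FooterReader {

    // hypothetical fixed footer size for this sketch;
    // BinaryInputFormat uses blockInfo.getInfoSize() instead
    private static final int FOOTER_SIZE = 32;

    public static byte[] readFooter(Path path) throws IOException {
        final FileSystem fs = path.getFileSystem();
        final FileStatus status = fs.getFileStatus(path);
        if (status.getLen() < FOOTER_SIZE) {
            throw new IOException("File too short to contain a footer: " + path);
        }
        final byte[] footer = new byte[FOOTER_SIZE];
        try (FSDataInputStream in = fs.open(path)) {
            // position the stream at the start of the trailing footer
            in.seek(status.getLen() - FOOTER_SIZE);
            int off = 0;
            while (off < FOOTER_SIZE) {
                final int n = in.read(footer, off, FOOTER_SIZE - off);
                if (n < 0) {
                    throw new EOFException("Unexpected end of stream while reading footer of " + path);
                }
                off += n;
            }
        }
        return footer;
    }
}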
Use of org.apache.flink.core.fs.FileSystem in project flink by apache: the class DelimitedInputFormat, method getStatistics.
@Override
public FileBaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
    final FileBaseStatistics cachedFileStats = (cachedStats != null && cachedStats instanceof FileBaseStatistics) ? (FileBaseStatistics) cachedStats : null;
    // store properties
    final long oldTimeout = this.openTimeout;
    final int oldBufferSize = this.bufferSize;
    final int oldLineLengthLimit = this.lineLengthLimit;
    try {
        final Path filePath = this.filePath;
        // get the filesystem
        final FileSystem fs = FileSystem.get(filePath.toUri());
        final ArrayList<FileStatus> allFiles = new ArrayList<FileStatus>(1);
        // let the file input format deal with the up-to-date check and the basic size
        final FileBaseStatistics stats = getFileStats(cachedFileStats, filePath, fs, allFiles);
        if (stats == null) {
            return null;
        }
        // if the average record width is already known, or the total input size is unknown, return the stats as they are
        if (stats.getAverageRecordWidth() != FileBaseStatistics.AVG_RECORD_BYTES_UNKNOWN || stats.getTotalInputSize() == FileBaseStatistics.SIZE_UNKNOWN) {
            return stats;
        }
        // TODO: Add sampling for unsplittable files. Right now, only compressed text files are affected by this limitation.
        if (unsplittable) {
            return stats;
        }
        // compute how many samples to take, depending on the defined upper and lower bound
        final int numSamples;
        if (this.numLineSamples != NUM_SAMPLES_UNDEFINED) {
            numSamples = this.numLineSamples;
        } else {
            // make the samples small for very small files
            final int calcSamples = (int) (stats.getTotalInputSize() / 1024);
            numSamples = Math.min(DEFAULT_MAX_NUM_SAMPLES, Math.max(DEFAULT_MIN_NUM_SAMPLES, calcSamples));
        }
        // check if sampling is disabled
        if (numSamples == 0) {
            return stats;
        }
        if (numSamples < 0) {
            throw new RuntimeException("Error: Invalid number of samples: " + numSamples);
        }
        // make sure that the sampling times out after a while if the file system does not answer in time
        this.openTimeout = 10000;
        // set a small read buffer size
        this.bufferSize = 4 * 1024;
        // prevent overly large records, for example if we have an incorrectly configured delimiter
        this.lineLengthLimit = MAX_SAMPLE_LEN;
        long offset = 0;
        long totalNumBytes = 0;
        long stepSize = stats.getTotalInputSize() / numSamples;
        int fileNum = 0;
        int samplesTaken = 0;
        // take the samples
        while (samplesTaken < numSamples && fileNum < allFiles.size()) {
            // make a split for the sample and use it to read a record
            FileStatus file = allFiles.get(fileNum);
            FileInputSplit split = new FileInputSplit(0, file.getPath(), offset, file.getLen() - offset, null);
            // we open the split, read one line, and take its length
            try {
                open(split);
                if (readLine()) {
                    totalNumBytes += this.currLen + this.delimiter.length;
                    samplesTaken++;
                }
            } finally {
                // close the file stream, do not release the buffers
                super.close();
            }
            offset += stepSize;
            // skip to the next file, if necessary
            while (fileNum < allFiles.size() && offset >= (file = allFiles.get(fileNum)).getLen()) {
                offset -= file.getLen();
                fileNum++;
            }
        }
        // we have the width, store it
        return new FileBaseStatistics(stats.getLastModificationTime(), stats.getTotalInputSize(), totalNumBytes / (float) samplesTaken);
    } catch (IOException ioex) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Could not determine statistics for file '" + this.filePath + "' due to an I/O error: " + ioex.getMessage());
        }
    } catch (Throwable t) {
        if (LOG.isErrorEnabled()) {
            LOG.error("Unexpected problem while getting the file statistics for file '" + this.filePath + "': " + t.getMessage(), t);
        }
    } finally {
        // restore properties (even on return)
        this.openTimeout = oldTimeout;
        this.bufferSize = oldBufferSize;
        this.lineLengthLimit = oldLineLengthLimit;
    }
    // no statistics possible
    return null;
}
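The sample-count heuristic above aims at roughly one sample per KiB of input, clamped between the configured minimum and maximum. A small sketch of just that arithmetic; the bound values 2 and 10 are assumed stand-ins for DEFAULT_MIN_NUM_SAMPLES and DEFAULT_MAX_NUM_SAMPLES, whose actual values come from the format's configuration:

public class SampleCount {

    // assumed stand-ins for DelimitedInputFormat's configured sample bounds
    private static final int MIN_SAMPLES = 2;
    private static final int MAX_SAMPLES = 10;

    static int numSamples(long totalInputSize) {
        // one sample per KiB of input, clamped to [MIN_SAMPLES, MAX_SAMPLES]
        final int calcSamples = (int) (totalInputSize / 1024);
        return Math.min(MAX_SAMPLES, Math.max(MIN_SAMPLES, calcSamples));
    }

    public static void main(String[] args) {
        System.out.println(numSamples(512));       // tiny file -> clamped up to MIN_SAMPLES
        System.out.println(numSamples(4 * 1024));  // 4 KiB -> 4 samples
        System.out.println(numSamples(1 << 20));   // 1 MiB -> clamped down to MAX_SAMPLES
    }
}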
Use of org.apache.flink.core.fs.FileSystem in project flink by apache: the class FileInputFormat, method addFilesInDir.
/**
 * Enumerates all files in the directory, recursing into nested directories if enumerateNestedFiles is true.
 *
 * @return the total length of accepted files.
 */
private long addFilesInDir(Path path, List<FileStatus> files, boolean logExcludedFiles) throws IOException {
    final FileSystem fs = path.getFileSystem();
    long length = 0;
    for (FileStatus dir : fs.listStatus(path)) {
        if (dir.isDir()) {
            if (acceptFile(dir) && enumerateNestedFiles) {
                length += addFilesInDir(dir.getPath(), files, logExcludedFiles);
            } else {
                if (logExcludedFiles && LOG.isDebugEnabled()) {
                    LOG.debug("Directory " + dir.getPath().toString() + " did not pass the file-filter and is excluded.");
                }
            }
        } else {
            if (acceptFile(dir)) {
                files.add(dir);
                length += dir.getLen();
                testForUnsplittable(dir);
            } else {
                if (logExcludedFiles && LOG.isDebugEnabled()) {
                    LOG.debug("File " + dir.getPath().toString() + " did not pass the file-filter and is excluded.");
                }
            }
        }
    }
    return length;
}
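Stripped of the filtering and splittability checks, the same listStatus-based recursion looks as follows. This is a hedged sketch; the FileLister class and listFiles method are illustrative names, not Flink API:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.flink.core.fs.FileStatus;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;

public class FileLister {

    /** Recursively collects all regular files under the given path. */
    public static List<FileStatus> listFiles(Path path) throws IOException {
        final List<FileStatus> result = new ArrayList<>();
        final FileSystem fs = path.getFileSystem();
        for (FileStatus status : fs.listStatus(path)) {
            if (status.isDir()) {
                // descend into the nested directory
                result.addAll(listFiles(status.getPath()));
            } else {
                result.add(status);
            }
        }
        return result;
    }
}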
Use of org.apache.flink.core.fs.FileSystem in project flink by apache: the class FileInputFormat, method getStatistics.
/**
 * Obtains basic file statistics containing only file size. If the input is a directory, then the size is the sum of all contained files.
 *
 * @see org.apache.flink.api.common.io.InputFormat#getStatistics(org.apache.flink.api.common.io.statistics.BaseStatistics)
 */
@Override
public FileBaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
    final FileBaseStatistics cachedFileStats = (cachedStats != null && cachedStats instanceof FileBaseStatistics) ? (FileBaseStatistics) cachedStats : null;
    try {
        final Path path = this.filePath;
        final FileSystem fs = FileSystem.get(path.toUri());
        return getFileStats(cachedFileStats, path, fs, new ArrayList<FileStatus>(1));
    } catch (IOException ioex) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Could not determine statistics for file '" + this.filePath + "' due to an I/O error: " + ioex.getMessage());
        }
    } catch (Throwable t) {
        if (LOG.isErrorEnabled()) {
            LOG.error("Unexpected problem while getting the file statistics for file '" + this.filePath + "': " + t.getMessage(), t);
        }
    }
    // no statistics available
    return null;
}
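Note that this method resolves the file system with FileSystem.get(path.toUri()), while addFilesInDir calls path.getFileSystem(); both look up the FileSystem implementation registered for the URI's scheme. A quick sketch of the lookup, using a hypothetical local path (an hdfs:// or s3:// URI would resolve to the file system registered for that scheme instead):

import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;

public class FsResolution {

    public static void main(String[] args) throws Exception {
        // hypothetical local path for illustration
        final Path path = new Path("file:///tmp/input");
        final FileSystem viaStatic = FileSystem.get(path.toUri());
        final FileSystem viaPath = path.getFileSystem();
        // both resolve to the implementation registered for the "file" scheme
        System.out.println(viaStatic.getClass().getName());
        System.out.println(viaPath.getClass().getName());
    }
}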
Use of org.apache.flink.core.fs.FileSystem in project flink by apache: the class FileOutputFormat, method open.
@Override
public void open(int taskNumber, int numTasks) throws IOException {
    if (taskNumber < 0 || numTasks < 1) {
        throw new IllegalArgumentException("TaskNumber: " + taskNumber + ", numTasks: " + numTasks);
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Opening stream for output (" + (taskNumber + 1) + "/" + numTasks + "). WriteMode=" + writeMode + ", OutputDirectoryMode=" + outputDirectoryMode);
    }
    Path p = this.outputFilePath;
    if (p == null) {
        throw new IOException("The file path is null.");
    }
    final FileSystem fs = p.getFileSystem();
    // if this is a local file system, we need to initialize the local output directory here
    if (!fs.isDistributedFS()) {
        if (numTasks == 1 && outputDirectoryMode == OutputDirectoryMode.PARONLY) {
            // prepare local output path. checks for write mode and removes existing files in case of OVERWRITE mode
            if (!fs.initOutPathLocalFS(p, writeMode, false)) {
                // output preparation failed! Cancel task.
                throw new IOException("Output path '" + p.toString() + "' could not be initialized. Canceling task...");
            }
        } else {
            if (!fs.initOutPathLocalFS(p, writeMode, true)) {
                // output preparation failed! Cancel task.
                throw new IOException("Output directory '" + p.toString() + "' could not be created. Canceling task...");
            }
        }
    }
    // suffix the path with the parallel instance index, if needed
    this.actualFilePath = (numTasks > 1 || outputDirectoryMode == OutputDirectoryMode.ALWAYS) ? p.suffix("/" + getDirectoryFileName(taskNumber)) : p;
    // create output file
    this.stream = fs.create(this.actualFilePath, writeMode);
    // at this point, the file creation must have succeeded, or an exception has been thrown
    this.fileCreated = true;
}
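The actualFilePath assignment is the key decision here: with more than one parallel task, or with OutputDirectoryMode.ALWAYS, the configured path becomes a directory and each task writes its own file inside it. A minimal sketch of that naming scheme, assuming getDirectoryFileName returns the 1-based task index (an assumption about the default; subclasses may override it):

import org.apache.flink.core.fs.Path;

public class OutputPathNaming {

    // mirrors the decision in FileOutputFormat.open; class and method names are illustrative
    static Path actualPath(Path base, int taskNumber, int numTasks, boolean alwaysDirectory) {
        // assumption: the per-task file name is the 1-based task index
        return (numTasks > 1 || alwaysDirectory)
                ? base.suffix("/" + (taskNumber + 1))
                : base;
    }

    public static void main(String[] args) {
        final Path base = new Path("file:///tmp/out");
        System.out.println(actualPath(base, 0, 4, false)); // base path plus "/1"
        System.out.println(actualPath(base, 3, 4, false)); // base path plus "/4"
        System.out.println(actualPath(base, 0, 1, false)); // base path itself, single output file
    }
}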