Use of org.apache.hadoop.fs.ContentSummary in project gatk by broadinstitute.
From the class RunSGAViaProcessBuilderOnSpark, method loadFASTQFiles.
/**
 * Loads the FASTQ files in the user-specified directory and returns an RDD that satisfies the same contract
 * as described in {@link JavaSparkContext#wholeTextFiles(String, int)}.
 *
 * The file count reported by the file system's content summary for the directory is passed as the
 * second argument of {@code wholeTextFiles}.
 *
 * @param ctx                            Spark context used to read the files; its Hadoop configuration is used
 *                                       to resolve the file system that owns the directory
 * @param pathToAllInterleavedFASTQFiles path to the directory where all FASTQ files to perform local assembly upon are located
 * @return the RDD produced by {@code ctx.wholeTextFiles} for the given directory
 * @throws GATKException if an {@link IOException} occurs while resolving the file system or getting the file
 *                       count of the specified directory; the original exception is attached as the cause
 */
private static JavaPairRDD<String, String> loadFASTQFiles(final JavaSparkContext ctx, final String pathToAllInterleavedFASTQFiles) {
    try {
        final org.apache.hadoop.fs.Path fastqDirectory = new org.apache.hadoop.fs.Path(pathToAllInterleavedFASTQFiles);
        // Resolve the file system from the path itself so that a path carrying its own scheme is not
        // queried against the default file system; scheme-less paths still resolve to the default one.
        final FileSystem hadoopFileSystem = fastqDirectory.getFileSystem(ctx.hadoopConfiguration());
        final ContentSummary cs = hadoopFileSystem.getContentSummary(fastqDirectory);
        // Clamp instead of a bare narrowing cast so an oversized count cannot wrap to a negative value.
        final int fileCount = (int) Math.min(cs.getFileCount(), (long) Integer.MAX_VALUE);
        return ctx.wholeTextFiles(pathToAllInterleavedFASTQFiles, fileCount);
    } catch (final IOException e) {
        // Keep the IOException as the cause so the underlying stack trace is not lost.
        throw new GATKException("Unable to get the file count under " + pathToAllInterleavedFASTQFiles + ": " + e.getMessage(), e);
    }
}
Aggregations