Use of scala.Tuple2 in project cdap by caskdata.
The class WordCountSink, method run.
@Override
public void run(SparkExecutionPluginContext sparkExecutionPluginContext, JavaRDD<StructuredRecord> javaRDD) throws Exception {
    WordCount wordCount = new WordCount(config.field);
    JavaPairRDD<byte[], byte[]> outputRDD = wordCount.countWords(javaRDD)
        .mapToPair(new PairFunction<Tuple2<String, Long>, byte[], byte[]>() {
            @Override
            public Tuple2<byte[], byte[]> call(Tuple2<String, Long> stringLongTuple2) throws Exception {
                return new Tuple2<>(Bytes.toBytes(stringLongTuple2._1()), Bytes.toBytes(stringLongTuple2._2()));
            }
        });
    sparkExecutionPluginContext.saveAsDataset(outputRDD, config.tableName);
}
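The same mapToPair/Tuple2 pattern can be tried outside CDAP with nothing but the core Spark Java API. The following is a minimal, self-contained sketch rather than the CDAP plugin itself: the local SparkConf/JavaSparkContext setup, the sample input, and serialization via getBytes are illustrative assumptions standing in for WordCount.countWords and Bytes.toBytes.

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class Tuple2MapToPairSketch {
    public static void main(String[] args) {
        // Illustrative local context; a real plugin receives its context from the framework.
        SparkConf conf = new SparkConf().setAppName("tuple2-sketch").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // Count words, then convert each (word, count) Tuple2 into a (byte[], byte[]) pair,
            // mirroring the byte-serialization step in the sink above.
            JavaPairRDD<byte[], byte[]> serialized = sc
                .parallelize(Arrays.asList("a b", "b c"))
                .flatMap(line -> Arrays.asList(line.split(" ")).iterator())
                .mapToPair(word -> new Tuple2<>(word, 1L))
                .reduceByKey(Long::sum)
                .mapToPair(t -> new Tuple2<>(
                    t._1().getBytes(StandardCharsets.UTF_8),
                    String.valueOf(t._2()).getBytes(StandardCharsets.UTF_8)));
            serialized.collect().forEach(p ->
                System.out.println(new String(p._1(), StandardCharsets.UTF_8) + " -> "
                    + new String(p._2(), StandardCharsets.UTF_8)));
        }
    }
}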
Use of scala.Tuple2 in project gatk by broadinstitute.
The class ParallelCopyGCSDirectoryIntoHDFSSpark, method concatenateChunks.
private void concatenateChunks(final String outputDirectoryFinal, final FileSystem fs, final List<Path> gcsNIOPaths, final Map<String, Iterable<Tuple2<Integer, String>>> chunksByFilePath) throws IOException {
    for (Path path : gcsNIOPaths) {
        if (Files.isDirectory(path)) {
            continue;
        }
        final String filePath = path.toUri().toString();
        final Iterable<Tuple2<Integer, String>> chunkListForFile = chunksByFilePath.get(filePath);
        final String basename = path.getName(path.getNameCount() - 1).toString();
        final org.apache.hadoop.fs.Path outFilePath = new org.apache.hadoop.fs.Path(outputDirectoryFinal + "/" + basename);
        fs.createNewFile(outFilePath);
        final SortedMap<Integer, String> chunkMap = new TreeMap<>();
        for (Tuple2<Integer, String> entry : chunkListForFile) {
            chunkMap.put(entry._1(), entry._2());
        }
        final org.apache.hadoop.fs.Path[] chunkPaths = new org.apache.hadoop.fs.Path[chunkMap.size()];
        final Iterator<Integer> iterator = chunkMap.keySet().iterator();
        while (iterator.hasNext()) {
            final Integer next = iterator.next();
            final String chunkPath = chunkMap.get(next);
            chunkPaths[next] = new org.apache.hadoop.fs.Path(chunkPath);
        }
        fs.concat(outFilePath, chunkPaths);
    }
}
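The ordering step in concatenateChunks can be looked at in isolation: a TreeMap keyed by chunk index yields the Tuple2 entries in index order no matter how the distributed copy emitted them, and indexing the output array by chunk number gives FileSystem.concat its parts in original file order. A small self-contained sketch with made-up chunk paths:

import java.util.Arrays;
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;

import scala.Tuple2;

public class ChunkOrderingSketch {
    public static void main(String[] args) {
        // Tuple2 entries may arrive out of order from the distributed copy step.
        List<Tuple2<Integer, String>> chunks = Arrays.asList(
            new Tuple2<>(2, "hdfs://tmp/part_2"),
            new Tuple2<>(0, "hdfs://tmp/part_0"),
            new Tuple2<>(1, "hdfs://tmp/part_1"));
        SortedMap<Integer, String> chunkMap = new TreeMap<>();
        for (Tuple2<Integer, String> entry : chunks) {
            chunkMap.put(entry._1(), entry._2());
        }
        // Indexing the array by chunk number reproduces the original file order.
        String[] ordered = new String[chunkMap.size()];
        chunkMap.forEach((idx, p) -> ordered[idx] = p);
        System.out.println(Arrays.toString(ordered));
    }
}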
Use of scala.Tuple2 in project gatk by broadinstitute.
The class PSPairedUnpairedSplitterSpark, method mapPartitionsToPairedAndUnpairedLists.
/**
 * Maps each partition to a Tuple2 of two Lists, the first containing the paired reads, the second containing the unpaired reads.
 */
private static Iterator<Tuple2<List<GATKRead>, List<GATKRead>>> mapPartitionsToPairedAndUnpairedLists(final Iterator<GATKRead> iter, final int readsPerPartitionGuess) {
    //Find the paired and unpaired reads by scanning the partition for repeated names
    final List<GATKRead> pairedReadsList = new ArrayList<>(readsPerPartitionGuess);
    final Map<String, GATKRead> unpairedReads = new HashMap<>(readsPerPartitionGuess);
    while (iter.hasNext()) {
        final GATKRead read = iter.next();
        final String readName = read.getName();
        //If the read's mate is already in unpairedReads then we have a pair, which gets added to the ordered List
        if (unpairedReads.containsKey(readName)) {
            pairedReadsList.add(read);
            pairedReadsList.add(unpairedReads.remove(readName));
        } else {
            unpairedReads.put(readName, read);
        }
    }
    //Get the unpaired reads out of the hashmap
    final List<GATKRead> unpairedReadsList = new ArrayList<>(unpairedReads.values());
    //Minimize the paired reads list's memory footprint (don't rely on readsPerPartitionGuess)
    final List<GATKRead> pairedReadsListResized = new ArrayList<>(pairedReadsList.size());
    pairedReadsListResized.addAll(pairedReadsList);
    //Wrap and return the result
    return Collections.singletonList(new Tuple2<>(pairedReadsListResized, unpairedReadsList)).iterator();
}
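In typical use this splitter runs once per partition via JavaRDD.mapPartitions. The sketch below adapts the same pairing-by-name pattern to plain String names so it is runnable on its own; GATKRead and the surrounding GATK types are not reproduced, and the local Spark context and sample data are assumptions for illustration.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class PairedSplitSketch {
    // Same pairing-by-name idea as the GATK method, adapted to plain String "read names".
    private static Iterator<Tuple2<List<String>, List<String>>> split(Iterator<String> iter) {
        final List<String> paired = new ArrayList<>();
        final Map<String, String> unpaired = new HashMap<>();
        while (iter.hasNext()) {
            final String name = iter.next();
            if (unpaired.containsKey(name)) {
                paired.add(name);
                paired.add(unpaired.remove(name));
            } else {
                unpaired.put(name, name);
            }
        }
        // One (pairedList, unpairedList) Tuple2 per partition.
        return Collections.singletonList(new Tuple2<>(paired, new ArrayList<>(unpaired.values()))).iterator();
    }

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("paired-split-sketch").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> names = sc.parallelize(Arrays.asList("r1", "r2", "r1", "r3"), 1);
            List<Tuple2<List<String>, List<String>>> result =
                names.mapPartitions(PairedSplitSketch::split).collect();
            System.out.println(result);
        }
    }
}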
Use of scala.Tuple2 in project gatk by broadinstitute.
The class ParallelCopyGCSDirectoryIntoHDFSSpark, method setupChunks.
private List<Tuple2<String, Integer>> setupChunks(final long chunkSize, final List<Path> gcsNIOPaths) throws IOException {
    final List<Tuple2<String, Integer>> chunkList = new ArrayList<>();
    for (Path path : gcsNIOPaths) {
        if (Files.isDirectory(path)) {
            logger.info("skipping directory " + path);
            continue;
        }
        final long fileSize = Files.size(path);
        final long chunks = fileSize / chunkSize + (fileSize % chunkSize == 0 ? 0 : 1);
        logger.info("processing path " + path + ", size = " + fileSize + ", chunks = " + chunks);
        for (int i = 0; i < chunks; i++) {
            chunkList.add(new Tuple2<>(path.toUri().toString(), i));
        }
    }
    return chunkList;
}
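The chunks expression is a ceiling division: fileSize / chunkSize, rounded up unless fileSize is an exact multiple of chunkSize. A tiny worked example with assumed sizes:

public class ChunkCountSketch {
    public static void main(String[] args) {
        // Illustrative values only: a 130 MB file with a 64 MB chunk size yields 3 chunks (indices 0..2).
        final long fileSize = 130L * 1024 * 1024;
        final long chunkSize = 64L * 1024 * 1024;
        final long chunks = fileSize / chunkSize + (fileSize % chunkSize == 0 ? 0 : 1);
        System.out.println(chunks); // prints 3
    }
}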
Use of scala.Tuple2 in project gatk by broadinstitute.
The class ParallelCopyGCSDirectoryIntoHDFSSpark, method runTool.
@Override
protected void runTool(final JavaSparkContext ctx) {
    if (!BucketUtils.isCloudStorageUrl(inputGCSPath)) {
        throw new UserException("Input path " + inputGCSPath + " is not a GCS URI");
    }
    if (!BucketUtils.isHadoopUrl(outputHDFSDirectory)) {
        throw new UserException("Output directory " + outputHDFSDirectory + " is not an HDFS URI");
    }
    final String inputGCSPathFinal = inputGCSPath;
    final String outputDirectoryFinal = outputHDFSDirectory;
    final org.apache.hadoop.fs.Path outputHdfsDirectoryPath = new org.apache.hadoop.fs.Path(outputHDFSDirectory);
    try (FileSystem fs = outputHdfsDirectoryPath.getFileSystem(new Configuration())) {
        if (fs.exists(outputHdfsDirectoryPath)) {
            throw new UserException("Specified output directory " + outputHdfsDirectoryPath + " already exists. Please specify a new directory name.");
        }
        fs.mkdirs(outputHdfsDirectoryPath);
        final long chunkSize = getChunkSize(fs);
        final List<Path> gcsNIOPaths = getGCSFilePathsToCopy(inputGCSPathFinal);
        final List<Tuple2<String, Integer>> chunkList = setupChunks(chunkSize, gcsNIOPaths);
        if (chunkList.isEmpty()) {
            logger.info("no files found to copy");
            return;
        }
        final JavaPairRDD<String, Integer> chunkRDD = ctx.parallelizePairs(chunkList, chunkList.size());
        final JavaPairRDD<String, Tuple2<Integer, String>> chunkMappingRDD =
                chunkRDD.mapToPair(p -> new Tuple2<>(p._1(), readChunkToHdfs(p._1(), chunkSize, p._2(), outputDirectoryFinal)));
        final Map<String, Iterable<Tuple2<Integer, String>>> chunksByFilePath = chunkMappingRDD.groupByKey().collectAsMap();
        concatenateChunks(outputDirectoryFinal, fs, gcsNIOPaths, chunksByFilePath);
    } catch (NoSuchFileException e) {
        throw new UserException("Could not locate input path " + e.getFile() + ". If you are trying to copy an entire directory, please include a trailing slash on your path.");
    } catch (IOException e) {
        throw new GATKException(e.getMessage(), e);
    }
}
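The Spark pipeline in runTool has a simple shape: parallelize the (filePath, chunkIndex) pairs, map each to (filePath, (chunkIndex, hdfsChunkPath)), then groupByKey and collectAsMap to gather every file's chunk locations on the driver. The following standalone sketch reproduces that shape with invented paths and a trivial stand-in for readChunkToHdfs, which is not part of this excerpt:

import java.util.Arrays;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class ChunkPipelineSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("chunk-pipeline-sketch").setMaster("local[*]");
        try (JavaSparkContext ctx = new JavaSparkContext(conf)) {
            // (gcs file path, chunk index) pairs, as produced by a setupChunks-style step.
            JavaPairRDD<String, Integer> chunkRDD = ctx.parallelizePairs(Arrays.asList(
                new Tuple2<>("gs://bucket/a.bam", 0),
                new Tuple2<>("gs://bucket/a.bam", 1),
                new Tuple2<>("gs://bucket/b.bam", 0)), 3);
            // Stand-in for readChunkToHdfs: pretend each chunk was copied to a per-chunk HDFS path.
            JavaPairRDD<String, Tuple2<Integer, String>> chunkMappingRDD =
                chunkRDD.mapToPair(p -> new Tuple2<>(p._1(), new Tuple2<>(p._2(), "hdfs://tmp/" + p._2())));
            // One entry per input file, with all of its chunk locations.
            Map<String, Iterable<Tuple2<Integer, String>>> chunksByFilePath =
                chunkMappingRDD.groupByKey().collectAsMap();
            System.out.println(chunksByFilePath);
        }
    }
}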