use of org.apache.spark.api.java.JavaSparkContext in project cdap by caskdata.
the class NaiveBayesTrainer method run.
@Override
public void run(SparkExecutionPluginContext sparkContext, JavaRDD<StructuredRecord> input) throws Exception {
    Preconditions.checkArgument(input.count() != 0, "Input RDD is empty.");
    final HashingTF tf = new HashingTF(100);
    JavaRDD<LabeledPoint> trainingData = input.map(new Function<StructuredRecord, LabeledPoint>() {
        @Override
        public LabeledPoint call(StructuredRecord record) throws Exception {
            // should never happen, here to test app correctness in unit tests
            if (inputSchema != null && !inputSchema.equals(record.getSchema())) {
                throw new IllegalStateException("runtime schema does not match what was set at configure time.");
            }
            String text = record.get(config.fieldToClassify);
            return new LabeledPoint((Double) record.get(config.predictionField),
                                    tf.transform(Lists.newArrayList(text.split(" "))));
        }
    });
    trainingData.cache();
    final NaiveBayesModel model = NaiveBayes.train(trainingData.rdd(), 1.0);
    // save the model to a file in the output FileSet
    JavaSparkContext javaSparkContext = sparkContext.getSparkContext();
    FileSet outputFS = sparkContext.getDataset(config.fileSetName);
    model.save(JavaSparkContext.toSparkContext(javaSparkContext),
               outputFS.getBaseLocation().append(config.path).toURI().getPath());
    JavaPairRDD<Long, String> textsToClassify = sparkContext.fromStream(TEXTS_TO_CLASSIFY, String.class);
    JavaRDD<Vector> featuresToClassify = textsToClassify.map(new Function<Tuple2<Long, String>, Vector>() {
        @Override
        public Vector call(Tuple2<Long, String> longWritableTextTuple2) throws Exception {
            String text = longWritableTextTuple2._2();
            return tf.transform(Lists.newArrayList(text.split(" ")));
        }
    });
    JavaRDD<Double> predict = model.predict(featuresToClassify);
    LOG.info("Predictions: {}", predict.collect());
    // key the predictions with the message
    JavaPairRDD<String, Double> keyedPredictions = textsToClassify.values().zip(predict);
    // convert to byte[],byte[] to write to the dataset
    JavaPairRDD<byte[], byte[]> bytesRDD = keyedPredictions.mapToPair(new PairFunction<Tuple2<String, Double>, byte[], byte[]>() {
        @Override
        public Tuple2<byte[], byte[]> call(Tuple2<String, Double> tuple) throws Exception {
            return new Tuple2<>(Bytes.toBytes(tuple._1()), Bytes.toBytes(tuple._2()));
        }
    });
    sparkContext.saveAsDataset(bytesRDD, CLASSIFIED_TEXTS);
}
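A minimal follow-on sketch (not part of the CDAP plugin above) of how the saved model could be read back and applied to a single document with the standard MLlib APIs; the model path, application name, and local master are placeholders for illustration.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.classification.NaiveBayesModel;
import org.apache.spark.mllib.feature.HashingTF;
import org.apache.spark.mllib.linalg.Vector;

public class LoadAndPredict {
    public static void main(String[] args) {
        JavaSparkContext jsc = new JavaSparkContext(
                new SparkConf().setAppName("load-and-predict").setMaster("local[*]"));
        // use the same feature dimension that was used at training time (100 above)
        HashingTF tf = new HashingTF(100);
        // "model/output/path" is a placeholder for wherever the plugin wrote the model
        NaiveBayesModel model = NaiveBayesModel.load(jsc.sc(), "model/output/path");
        Vector features = tf.transform(Arrays.asList("some text to classify".split(" ")));
        System.out.println("Predicted label: " + model.predict(features));
        jsc.stop();
    }
}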
use of org.apache.spark.api.java.JavaSparkContext in project cdap by caskdata.
the class FakeSpark method run.
@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
    JavaSparkContext jsc = new JavaSparkContext();
    LOG.info("HelloFakeSpark");
    List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
    LOG.info("Collected: {}", jsc.parallelize(data).collect());
}
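The no-argument JavaSparkContext constructor works here because it picks up the Spark configuration that the surrounding runtime has already set (for example via spark-submit system properties). Outside such a framework, a SparkConf is normally supplied explicitly; a minimal standalone sketch of the same parallelize-and-collect call, assuming a local master, follows.

import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class ParallelizeExample {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("parallelize-example").setMaster("local[*]");
        // JavaSparkContext is Closeable, so try-with-resources stops it for us
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
            // parallelize distributes the local list; collect brings the elements back to the driver
            System.out.println("Collected: " + jsc.parallelize(data).collect());
        }
    }
}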
use of org.apache.spark.api.java.JavaSparkContext in project gatk by broadinstitute.
the class CountBasesSpark method runTool.
@Override
protected void runTool(final JavaSparkContext ctx) {
    final JavaRDD<GATKRead> reads = getReads();
    final long count = reads.map(r -> (long) r.getLength()).reduce(Long::sum);
    System.out.println(count);
    if (out != null) {
        try (final PrintStream ps = new PrintStream(BucketUtils.createFile(out))) {
            ps.print(count);
        }
    }
}
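The core of this tool is a map to per-read long values followed by a reduce with Long::sum. A minimal self-contained sketch of the same map/reduce pattern on plain strings (all names here are illustrative, not GATK code) follows.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class SumLengths {
    public static void main(String[] args) {
        try (JavaSparkContext jsc = new JavaSparkContext(
                new SparkConf().setAppName("sum-lengths").setMaster("local[*]"))) {
            JavaRDD<String> records = jsc.parallelize(Arrays.asList("ACGT", "AC", "ACGTACGT"));
            // map each record to its length as a long, then sum the lengths,
            // mirroring reads.map(r -> (long) r.getLength()).reduce(Long::sum)
            long total = records.map(s -> (long) s.length()).reduce(Long::sum);
            System.out.println(total); // 14
        }
    }
}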
use of org.apache.spark.api.java.JavaSparkContext in project gatk by broadinstitute.
the class CollectMultipleMetricsSpark method runTool.
@Override
protected void runTool(final JavaSparkContext ctx) {
    final JavaRDD<GATKRead> unFilteredReads = getUnfilteredReads();
    List<SparkCollectorProvider> collectorsToRun = getCollectorsToRun();
    if (collectorsToRun.size() > 1) {
        // if there is more than one collector to run, cache the
        // unfiltered RDD so we don't recompute it
        unFilteredReads.cache();
    }
    for (final SparkCollectorProvider provider : collectorsToRun) {
        MetricsCollectorSpark<? extends MetricsArgumentCollection> metricsCollector =
                provider.createCollector(outputBaseName, metricAccumulationLevel.accumulationLevels,
                        getDefaultHeaders(), getHeaderForReads());
        validateCollector(metricsCollector, collectorsToRun.get(collectorsToRun.indexOf(provider)).getClass().getName());
        // Execute the collector's lifecycle:
        // bypass the framework merging of command line filters and just apply the default
        // ones specified by the collector
        ReadFilter readFilter = ReadFilter.fromList(metricsCollector.getDefaultReadFilters(), getHeaderForReads());
        metricsCollector.collectMetrics(unFilteredReads.filter(r -> readFilter.test(r)), getHeaderForReads());
        metricsCollector.saveMetrics(getReadSourceName(), getAuthHolder());
    }
}
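The detail worth noting above is that the unfiltered RDD is cached only when more than one collector will consume it, because each collector re-filters the same data. A minimal sketch of that cache-once, filter-many pattern on a plain RDD (the predicates and names are made up for illustration) follows.

import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

public class CacheAndFilterMany {
    public static void main(String[] args) {
        try (JavaSparkContext jsc = new JavaSparkContext(
                new SparkConf().setAppName("cache-and-filter").setMaster("local[*]"))) {
            JavaRDD<Integer> numbers = jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6));
            // several consumers will each apply their own filter to the same data,
            // so cache the shared RDD once instead of recomputing it on every pass
            numbers.cache();
            List<Function<Integer, Boolean>> filters = Arrays.asList(n -> n % 2 == 0, n -> n > 3);
            for (Function<Integer, Boolean> filter : filters) {
                System.out.println(numbers.filter(filter).collect());
            }
        }
    }
}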
use of org.apache.spark.api.java.JavaSparkContext in project gatk-protected by broadinstitute.
the class HDF5PCACoveragePoNCreationUtilsUnitTest method testCalculateReducedPanelAndPInversesKeepingHalfOfAllColumns.
@Test(dataProvider = "readCountOnlyWithDiverseShapeData")
public void testCalculateReducedPanelAndPInversesKeepingHalfOfAllColumns(final ReadCountCollection readCounts) {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final ReductionResult result = HDF5PCACoveragePoNCreationUtils.calculateReducedPanelAndPInverses(
            readCounts, OptionalInt.of(readCounts.columnNames().size() / 2), NULL_LOGGER, ctx);
    final RealMatrix counts = readCounts.counts();
    Assert.assertNotNull(result);
    Assert.assertNotNull(result.getPseudoInverse());
    Assert.assertNotNull(result.getReducedCounts());
    Assert.assertNotNull(result.getReducedPseudoInverse());
    Assert.assertNotNull(result.getAllSingularValues());
    Assert.assertEquals(counts.getColumnDimension(), result.getAllSingularValues().length);
    Assert.assertEquals(result.getReducedCounts().getRowDimension(), counts.getRowDimension());
    Assert.assertEquals(result.getReducedCounts().getColumnDimension(), readCounts.columnNames().size() / 2);
    final int eigensamples = result.getReducedCounts().getColumnDimension();
    Assert.assertEquals(eigensamples, readCounts.columnNames().size() / 2);
    assertPseudoInverse(counts, result.getPseudoInverse());
    assertPseudoInverse(result.getReducedCounts(), result.getReducedPseudoInverse());
}
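SparkContextFactory.getTestSparkContext() is a GATK test utility that hands out a local JavaSparkContext for unit tests. A rough, generic equivalent for tests that only need a shared local context is sketched below; the master setting, app name, and structure are assumptions, not the GATK implementation.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public final class TestSparkContexts {

    private static JavaSparkContext testContext;

    private TestSparkContexts() { }

    // Lazily create one local context and reuse it across tests, since starting
    // a SparkContext per test is slow and only one may be active per JVM by default.
    public static synchronized JavaSparkContext getTestSparkContext() {
        if (testContext == null) {
            SparkConf conf = new SparkConf()
                    .setAppName("unit-tests")
                    .setMaster("local[2]"); // two local threads are plenty for small tests
            testContext = new JavaSparkContext(conf);
        }
        return testContext;
    }
}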