Use of org.apache.spark.api.java.JavaPairRDD in project cdap by caskdata.
The class WordCountSink, method run:
@Override
public void run(SparkExecutionPluginContext sparkExecutionPluginContext, JavaRDD<StructuredRecord> javaRDD) throws Exception {
    WordCount wordCount = new WordCount(config.field);
    JavaPairRDD outputRDD = wordCount.countWords(javaRDD).mapToPair(new PairFunction<Tuple2<String, Long>, byte[], byte[]>() {
        @Override
        public Tuple2<byte[], byte[]> call(Tuple2<String, Long> stringLongTuple2) throws Exception {
            return new Tuple2<>(Bytes.toBytes(stringLongTuple2._1()), Bytes.toBytes(stringLongTuple2._2()));
        }
    });
    sparkExecutionPluginContext.saveAsDataset(outputRDD, config.tableName);
}
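For comparison, the same mapToPair pattern can be written as a minimal standalone sketch with Java 8 lambdas, outside of CDAP. The class name, the local[*] master, and the use of plain UTF-8 encoding in place of CDAP's Bytes.toBytes are illustrative assumptions, not part of the plugin above.

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class WordCountPairSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("word-count-pairs").setMaster("local[*]");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            JavaRDD<String> words = jsc.parallelize(Arrays.asList("spark", "rdd", "spark"));
            // Count words, then turn each (word, count) tuple into a (byte[], byte[]) pair,
            // mirroring the conversion the plugin performs before saveAsDataset.
            JavaPairRDD<byte[], byte[]> output = words
                    .mapToPair(word -> new Tuple2<>(word, 1L))
                    .reduceByKey(Long::sum)
                    .mapToPair(t -> new Tuple2<>(
                            t._1().getBytes(StandardCharsets.UTF_8),
                            String.valueOf(t._2()).getBytes(StandardCharsets.UTF_8)));
            output.collect().forEach(p -> System.out.println(
                    new String(p._1(), StandardCharsets.UTF_8) + " -> " + new String(p._2(), StandardCharsets.UTF_8)));
        }
    }
}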
Use of org.apache.spark.api.java.JavaPairRDD in project atlas-checks by osmlab.
The class IntegrityCheckSparkJob, method start:
@SuppressWarnings({ "rawtypes" })
@Override
public void start(final CommandMap commandMap) {
final String atlasDirectory = (String) commandMap.get(ATLAS_FOLDER);
final String input = Optional.ofNullable(input(commandMap)).orElse(atlasDirectory);
final String output = output(commandMap);
@SuppressWarnings("unchecked") final Set<OutputFormats> outputFormats = (Set<OutputFormats>) commandMap.get(OUTPUT_FORMATS);
final StringList countries = StringList.split((String) commandMap.get(COUNTRIES), CommonConstants.COMMA);
final MapRouletteConfiguration mapRouletteConfiguration = (MapRouletteConfiguration) commandMap.get(MAP_ROULETTE);
final Configuration checksConfiguration = ConfigurationResolver.loadConfiguration(commandMap, CONFIGURATION_FILES, CONFIGURATION_JSON);
final boolean saveIntermediateAtlas = (Boolean) commandMap.get(PBF_SAVE_INTERMEDIATE_ATLAS);
@SuppressWarnings("unchecked") final Rectangle pbfBoundary = ((Optional<Rectangle>) commandMap.getOption(PBF_BOUNDING_BOX)).orElse(Rectangle.MAXIMUM);
final boolean compressOutput = Boolean.valueOf((String) commandMap.get(SparkJob.COMPRESS_OUTPUT));
final Map<String, String> sparkContext = configurationMap();
final CheckResourceLoader checkLoader = new CheckResourceLoader(checksConfiguration);
// check configuration and country list
final Set<BaseCheck> preOverriddenChecks = checkLoader.loadChecks();
if (!isValidInput(countries, preOverriddenChecks)) {
logger.error("No countries supplied or checks enabled, exiting!");
return;
}
// Read priority countries from the configuration
final List<String> priorityCountries = checksConfiguration.get("priority.countries", Collections.EMPTY_LIST).value();
// Create a list of Country to Check tuples
// Add priority countries first if they are supplied by parameter
final List<Tuple2<String, Set<BaseCheck>>> countryCheckTuples = new ArrayList<>();
countries.stream().filter(priorityCountries::contains).forEach(country -> countryCheckTuples.add(new Tuple2<>(country, checkLoader.loadChecksForCountry(country))));
// Then add the rest of the countries
countries.stream().filter(country -> !priorityCountries.contains(country)).forEach(country -> countryCheckTuples.add(new Tuple2<>(country, checkLoader.loadChecksForCountry(country))));
// Log countries and integrity
logger.info("Initialized countries: {}", countryCheckTuples.stream().map(tuple -> tuple._1).collect(Collectors.joining(",")));
logger.info("Initialized checks: {}", preOverriddenChecks.stream().map(BaseCheck::getCheckName).collect(Collectors.joining(",")));
// Parallelize on the countries
final JavaPairRDD<String, Set<BaseCheck>> countryCheckRDD = getContext().parallelizePairs(countryCheckTuples, countryCheckTuples.size());
// Set target and temporary folders
final String targetOutputFolder = SparkFileHelper.parentPath(output);
final String temporaryOutputFolder = SparkFileHelper.combine(targetOutputFolder, SparkFileHelper.TEMPORARY_FOLDER_NAME);
// Useful file helper to create/delete/name files and directories
final SparkFileHelper fileHelper = new SparkFileHelper(sparkContext);
// Atlas Helper to load different types of Atlas data
final AtlasDataSource atlasLoader = new AtlasDataSource(sparkContext, checksConfiguration, pbfBoundary);
// Create target folders
fileHelper.mkdir(SparkFileHelper.combine(targetOutputFolder, OUTPUT_FLAG_FOLDER));
fileHelper.mkdir(SparkFileHelper.combine(targetOutputFolder, OUTPUT_GEOJSON_FOLDER));
fileHelper.mkdir(SparkFileHelper.combine(targetOutputFolder, OUTPUT_METRIC_FOLDER));
// Run the set of flags per country per check. The output will be an RDD pair mapping each
// country with a set of SparkFilePaths to flags, geojson and metrics generated.
final JavaPairRDD<String, Set<SparkFilePath>> resultRDD = countryCheckRDD.mapToPair(tuple -> {
final Time timer = Time.now();
final String country = tuple._1();
final Set<BaseCheck> checks = tuple._2();
logger.info("Initialized checks for {}: {}", country, checks.stream().map(BaseCheck::getCheckName).collect(Collectors.joining(",")));
final Set<SparkFilePath> resultingFiles = new HashSet<>();
final SparkFilePath flagOutput;
if (outputFormats.contains(OutputFormats.FLAGS)) {
// Initialize flag output processor
flagOutput = initializeOutput(OUTPUT_FLAG_FOLDER, TaskContext.get(), country, temporaryOutputFolder, targetOutputFolder);
EventService.get(country).register(new CheckFlagFileProcessor(fileHelper, flagOutput.getTemporaryPath()).withCompression(compressOutput));
} else {
flagOutput = null;
}
final SparkFilePath geoJsonOutput;
if (outputFormats.contains(OutputFormats.GEOJSON)) {
// Initialize geojson output processor
geoJsonOutput = initializeOutput(OUTPUT_GEOJSON_FOLDER, TaskContext.get(), country, temporaryOutputFolder, targetOutputFolder);
EventService.get(country).register(new CheckFlagGeoJsonProcessor(fileHelper, geoJsonOutput.getTemporaryPath()).withCompression(compressOutput));
} else {
geoJsonOutput = null;
}
final SparkFilePath metricOutput;
if (outputFormats.contains(OutputFormats.METRICS)) {
// Initialize metric output processor
metricOutput = initializeOutput(OUTPUT_METRIC_FOLDER, TaskContext.get(), country, temporaryOutputFolder, targetOutputFolder);
EventService.get(country).register(new MetricFileGenerator(METRICS_FILENAME, fileHelper, metricOutput.getTemporaryPath()));
} else {
metricOutput = null;
}
final Consumer<Atlas> intermediateAtlasHandler;
if (saveIntermediateAtlas) {
final SparkFilePath atlasOutput = initializeOutput(OUTPUT_ATLAS_FOLDER, TaskContext.get(), country, temporaryOutputFolder, targetOutputFolder);
intermediateAtlasHandler = atlas -> {
writeAtlas(atlas, country, atlasOutput, fileHelper);
resultingFiles.add(atlasOutput);
};
} else {
intermediateAtlasHandler = atlas -> {
// no-op
};
}
try {
final Atlas atlas = atlasLoader.load(input, country, intermediateAtlasHandler);
if (atlas == null) {
logger.error("Could not find {} Atlas files. Skipping country!", country);
} else {
executeChecks(country, atlas, checks, mapRouletteConfiguration);
// Add output folders for handling later
Stream.of(flagOutput, metricOutput, geoJsonOutput).filter(Objects::nonNull).forEach(resultingFiles::add);
}
EventService.get(country).complete();
return new Tuple2<>(country, resultingFiles);
} catch (final CoreException e) {
logger.error("Exception running integrity checks on {}", country, e);
}
logger.trace("Integrity checks took {} ms to execute for {}.", timer.elapsedSince().asMilliseconds(), country);
return new Tuple2<>(IGNORED_KEY, null);
}).filter(tuple -> !tuple._1().equals(IGNORED_KEY));
// Commit results
resultRDD.foreach(countryPathPair -> {
final String country = countryPathPair._1();
final Set<SparkFilePath> paths = countryPathPair._2();
logger.info("[{}] Committing outputs: {}", country, paths);
paths.forEach(fileHelper::commit);
});
try {
// Clean up
logger.info("Deleting {}.", temporaryOutputFolder);
fileHelper.deleteDirectory(temporaryOutputFolder);
} catch (final Exception e) {
logger.warn("Clean up failed!", e);
}
}
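The driver-side pattern above (parallelizePairs with one partition per country, per-key work inside mapToPair, an IGNORED_KEY sentinel filtered out afterwards, and a final foreach to commit results) can be reduced to the following self-contained sketch. All class, variable, and value names here are illustrative stand-ins; no atlas-checks APIs are used.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class PerCountryJobSketch {
    private static final String IGNORED_KEY = "IGNORED";

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("per-country-job").setMaster("local[*]");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            // One tuple per country; in the real job the value is the set of checks to run.
            List<Tuple2<String, Integer>> countryTuples = Arrays.asList(
                    new Tuple2<>("USA", 3), new Tuple2<>("MEX", 2), new Tuple2<>("CAN", 0));
            // One partition per country so each country is processed independently.
            JavaPairRDD<String, Integer> countryRDD =
                    jsc.parallelizePairs(countryTuples, countryTuples.size());
            JavaPairRDD<String, String> resultRDD = countryRDD
                    .mapToPair(tuple -> {
                        if (tuple._2() == 0) {
                            // Nothing to do for this country: mark it so it can be filtered out,
                            // mirroring the IGNORED_KEY convention in the job above.
                            return new Tuple2<>(IGNORED_KEY, (String) null);
                        }
                        return new Tuple2<>(tuple._1(), "ran " + tuple._2() + " checks");
                    })
                    .filter(tuple -> !tuple._1().equals(IGNORED_KEY));
            // "Commit" phase: act on each surviving per-country result.
            resultRDD.foreach(pair -> System.out.println(pair._1() + ": " + pair._2()));
        }
    }
}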
Use of org.apache.spark.api.java.JavaPairRDD in project presto by prestodb.
The class PrestoSparkRddFactory, method createRdd:
private <T extends PrestoSparkTaskOutput> JavaPairRDD<MutablePartitionId, T> createRdd(JavaSparkContext sparkContext, Session session, PlanFragment fragment, PrestoSparkTaskExecutorFactoryProvider executorFactoryProvider, CollectionAccumulator<SerializedTaskInfo> taskInfoCollector, CollectionAccumulator<PrestoSparkShuffleStats> shuffleStatsCollector, TableWriteInfo tableWriteInfo, Map<PlanFragmentId, JavaPairRDD<MutablePartitionId, PrestoSparkMutableRow>> rddInputs, Map<PlanFragmentId, Broadcast<?>> broadcastInputs, Class<T> outputType) {
    checkInputs(fragment.getRemoteSourceNodes(), rddInputs, broadcastInputs);
    PrestoSparkTaskDescriptor taskDescriptor = new PrestoSparkTaskDescriptor(session.toSessionRepresentation(), session.getIdentity().getExtraCredentials(), fragment, tableWriteInfo);
    SerializedPrestoSparkTaskDescriptor serializedTaskDescriptor = new SerializedPrestoSparkTaskDescriptor(taskDescriptorJsonCodec.toJsonBytes(taskDescriptor));
    Optional<Integer> numberOfShufflePartitions = Optional.empty();
    Map<String, RDD<Tuple2<MutablePartitionId, PrestoSparkMutableRow>>> shuffleInputRddMap = new HashMap<>();
    for (Map.Entry<PlanFragmentId, JavaPairRDD<MutablePartitionId, PrestoSparkMutableRow>> input : rddInputs.entrySet()) {
        RDD<Tuple2<MutablePartitionId, PrestoSparkMutableRow>> rdd = input.getValue().rdd();
        shuffleInputRddMap.put(input.getKey().toString(), rdd);
        if (!numberOfShufflePartitions.isPresent()) {
            numberOfShufflePartitions = Optional.of(rdd.getNumPartitions());
        } else {
            checkArgument(numberOfShufflePartitions.get() == rdd.getNumPartitions(), "Incompatible number of input partitions: %s != %s", numberOfShufflePartitions.get(), rdd.getNumPartitions());
        }
    }
    PrestoSparkTaskProcessor<T> taskProcessor = new PrestoSparkTaskProcessor<>(executorFactoryProvider, serializedTaskDescriptor, taskInfoCollector, shuffleStatsCollector, toTaskProcessorBroadcastInputs(broadcastInputs), outputType);
    Optional<PrestoSparkTaskSourceRdd> taskSourceRdd;
    List<TableScanNode> tableScans = findTableScanNodes(fragment.getRoot());
    if (!tableScans.isEmpty()) {
        try (CloseableSplitSourceProvider splitSourceProvider = new CloseableSplitSourceProvider(splitManager::getSplits)) {
            SplitSourceFactory splitSourceFactory = new SplitSourceFactory(splitSourceProvider, WarningCollector.NOOP);
            Map<PlanNodeId, SplitSource> splitSources = splitSourceFactory.createSplitSources(fragment, session, tableWriteInfo);
            taskSourceRdd = Optional.of(createTaskSourcesRdd(fragment.getId(), sparkContext, session, fragment.getPartitioning(), tableScans, splitSources, numberOfShufflePartitions));
        }
    } else if (rddInputs.size() == 0) {
        checkArgument(fragment.getPartitioning().equals(SINGLE_DISTRIBUTION), "SINGLE_DISTRIBUTION partitioning is expected: %s", fragment.getPartitioning());
        // In case of no inputs we still need to schedule a task.
        // Task with no inputs may produce results (e.g.: ValuesNode).
        // To force the task to be scheduled we create a PrestoSparkTaskSourceRdd that contains exactly one partition.
        // Since there's also no table scans in the fragment, the list of TaskSource's for this partition is empty.
        taskSourceRdd = Optional.of(new PrestoSparkTaskSourceRdd(sparkContext.sc(), ImmutableList.of(ImmutableList.of())));
    } else {
        taskSourceRdd = Optional.empty();
    }
    return JavaPairRDD.fromRDD(PrestoSparkTaskRdd.create(sparkContext.sc(), taskSourceRdd, shuffleInputRddMap, taskProcessor), classTag(MutablePartitionId.class), classTag(outputType));
}
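Two JavaPairRDD mechanics carry this factory: dropping down to the Scala RDD with rdd() (here to check that all shuffle inputs agree on their partition count) and wrapping a Scala RDD back up with JavaPairRDD.fromRDD plus explicit ClassTags. The sketch below isolates just those two steps; the classTag helper is assumed to wrap scala.reflect.ClassTag creation (the Presto snippet does not show its implementation), and all data and names are illustrative.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.rdd.RDD;
import scala.Tuple2;
import scala.reflect.ClassTag;

public class PairRddRoundTripSketch {
    // Illustrative stand-in for the classTag(...) helper used above.
    private static <T> ClassTag<T> classTag(Class<T> clazz) {
        return scala.reflect.ClassTag$.MODULE$.apply(clazz);
    }

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("pair-rdd-round-trip").setMaster("local[*]");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            JavaPairRDD<String, Integer> left = jsc.parallelizePairs(
                    Arrays.asList(new Tuple2<>("x", 1), new Tuple2<>("y", 2)), 2);
            JavaPairRDD<String, Integer> right = jsc.parallelizePairs(
                    Arrays.asList(new Tuple2<>("x", 10), new Tuple2<>("z", 20)), 2);
            // Like createRdd, require that every shuffle input has the same partition count.
            int expected = -1;
            for (JavaPairRDD<String, Integer> input : Arrays.asList(left, right)) {
                RDD<Tuple2<String, Integer>> rdd = input.rdd();
                if (expected < 0) {
                    expected = rdd.getNumPartitions();
                } else if (expected != rdd.getNumPartitions()) {
                    throw new IllegalArgumentException("Incompatible number of input partitions");
                }
            }
            // Wrap a Scala RDD back into a JavaPairRDD, supplying the ClassTags explicitly.
            JavaPairRDD<String, Integer> wrapped = JavaPairRDD.fromRDD(
                    left.rdd(), classTag(String.class), classTag(Integer.class));
            System.out.println(wrapped.collectAsMap());
        }
    }
}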
Use of org.apache.spark.api.java.JavaPairRDD in project learning-spark by databricks.
The class LogAnalyzerWindowed, method processAccessLogs:
public void processAccessLogs(String outDir, JavaDStream<ApacheAccessLog> accessLogsDStream) {
    JavaDStream<ApacheAccessLog> windowDStream = accessLogsDStream.window(Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    JavaDStream<String> ip = accessLogsDStream.map(new Function<ApacheAccessLog, String>() {
        public String call(ApacheAccessLog entry) {
            return entry.getIpAddress();
        }
    });
    // reduceByWindow
    JavaDStream<Long> requestCountRBW = accessLogsDStream.map(new Function<ApacheAccessLog, Long>() {
        public Long call(ApacheAccessLog entry) {
            return 1L;
        }
    }).reduceByWindow(new Function2<Long, Long, Long>() {
        public Long call(Long v1, Long v2) {
            return v1 + v2;
        }
    }, new Function2<Long, Long, Long>() {
        public Long call(Long v1, Long v2) {
            return v1 - v2;
        }
    }, Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    requestCountRBW.print();
    // reduceByKeyAndWindow
    JavaPairDStream<String, Long> ipAddressPairDStream = accessLogsDStream.mapToPair(new PairFunction<ApacheAccessLog, String, Long>() {
        public Tuple2<String, Long> call(ApacheAccessLog entry) {
            return new Tuple2(entry.getIpAddress(), 1L);
        }
    });
    JavaPairDStream<String, Long> ipCountDStream = ipAddressPairDStream.reduceByKeyAndWindow(
        // Adding elements in the new slice
        new Function2<Long, Long, Long>() {
            public Long call(Long v1, Long v2) {
                return v1 + v2;
            }
        },
        // Removing elements from the oldest slice
        new Function2<Long, Long, Long>() {
            public Long call(Long v1, Long v2) {
                return v1 - v2;
            }
        }, Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    ipCountDStream.print();
    // Use countByWindow
    JavaDStream<Long> requestCount = accessLogsDStream.countByWindow(Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    JavaPairDStream<String, Long> ipAddressRequestCount = ip.countByValueAndWindow(Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    requestCount.print();
    ipAddressRequestCount.print();
    // use a transform for the response code count
    JavaPairDStream<Integer, Long> responseCodeCountTransform = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<Integer, Long>>() {
        public JavaPairRDD<Integer, Long> call(JavaRDD<ApacheAccessLog> logs) {
            return Functions.responseCodeCount(logs);
        }
    });
    windowDStream.foreachRDD(new Function<JavaRDD<ApacheAccessLog>, Void>() {
        public Void call(JavaRDD<ApacheAccessLog> accessLogs) {
            Tuple4<Long, Long, Long, Long> contentSizeStats = Functions.contentSizeStats(accessLogs);
            List<Tuple2<Integer, Long>> responseCodeToCount = Functions.responseCodeCount(accessLogs).take(100);
            JavaPairRDD<String, Long> ipAddressCounts = Functions.ipAddressCount(accessLogs);
            List<String> ip = Functions.filterIPAddress(ipAddressCounts).take(100);
            Object ordering = Ordering.natural();
            Comparator<Long> cmp = (Comparator<Long>) ordering;
            List<Tuple2<String, Long>> topEndpoints = Functions.endpointCount(accessLogs).top(10, new Functions.ValueComparator<String, Long>(cmp));
            logStatistics = new LogStatistics(contentSizeStats, responseCodeToCount, ip, topEndpoints);
            return null;
        }
    });
}
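A minimal streaming sketch of the same windowed pair operations, written with lambdas instead of anonymous classes. It assumes a socket text source on localhost:9999 where each line is an IP address, a local master, and an illustrative checkpoint directory (required when reduceByKeyAndWindow is given an inverse function); none of these come from the learning-spark example above.

import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;

public class WindowedPairSketch {
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setAppName("windowed-pairs").setMaster("local[2]");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(10));
        // Inverse-function windows need a checkpoint directory; the path is illustrative.
        jssc.checkpoint("/tmp/windowed-pairs-checkpoint");
        JavaDStream<String> ips = jssc.socketTextStream("localhost", 9999);
        JavaPairDStream<String, Long> ipPairs = ips.mapToPair(ip -> new Tuple2<>(ip, 1L));
        // Add counts from the slice entering the window, subtract counts from the slice leaving it.
        JavaPairDStream<String, Long> ipCounts = ipPairs.reduceByKeyAndWindow(
                (v1, v2) -> v1 + v2,
                (v1, v2) -> v1 - v2,
                Durations.seconds(30),
                Durations.seconds(10));
        ipCounts.print();
        // countByValueAndWindow yields the same per-IP counts directly from the raw stream.
        JavaPairDStream<String, Long> ipCountsByValue =
                ips.countByValueAndWindow(Durations.seconds(30), Durations.seconds(10));
        ipCountsByValue.print();
        jssc.start();
        jssc.awaitTermination();
    }
}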
Use of org.apache.spark.api.java.JavaPairRDD in project learning-spark by databricks.
The class LogAnalyzerTotal, method processAccessLogs:
public void processAccessLogs(String outDir, JavaDStream<ApacheAccessLog> accessLogsDStream) {
    // Calculate statistics based on the content size, and update the static variables to track this.
    accessLogsDStream.foreachRDD(new Function<JavaRDD<ApacheAccessLog>, Void>() {
        public Void call(JavaRDD<ApacheAccessLog> accessLogs) {
            Tuple4<Long, Long, Long, Long> stats = Functions.contentSizeStats(accessLogs);
            if (stats != null) {
                runningCount.getAndAdd(stats._1());
                runningSum.getAndAdd(stats._2());
                runningMin.set(Math.min(runningMin.get(), stats._3()));
                runningMax.set(Math.max(runningMax.get(), stats._4()));
            }
            return null;
        }
    });
    // A DStream of response code counts
    JavaPairDStream<Integer, Long> responseCodeCountDStream = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<Integer, Long>>() {
        public JavaPairRDD<Integer, Long> call(JavaRDD<ApacheAccessLog> rdd) {
            return Functions.responseCodeCount(rdd);
        }
    }).updateStateByKey(new Functions.ComputeRunningSum());
    responseCodeCountDStream.foreachRDD(new Function<JavaPairRDD<Integer, Long>, Void>() {
        public Void call(JavaPairRDD<Integer, Long> rdd) {
            currentResponseCodeCounts = rdd.take(100);
            return null;
        }
    });
    // A DStream of ipAddressCounts.
    JavaPairDStream<String, Long> ipRawDStream = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<String, Long>>() {
        public JavaPairRDD<String, Long> call(JavaRDD<ApacheAccessLog> rdd) {
            return Functions.ipAddressCount(rdd);
        }
    });
    JavaPairDStream<String, Long> ipCumDStream = ipRawDStream.updateStateByKey(new Functions.ComputeRunningSum());
    // A DStream of ipAddressCounts without transform
    JavaPairDStream<String, Long> ipDStream = accessLogsDStream.mapToPair(new Functions.IpTuple());
    JavaPairDStream<String, Long> ipCountsDStream = ipDStream.reduceByKey(new Functions.LongSumReducer());
    // and joining it with the transfer amount
    JavaPairDStream<String, Long> ipBytesDStream = accessLogsDStream.mapToPair(new Functions.IpContentTuple());
    JavaPairDStream<String, Long> ipBytesSumDStream = ipBytesDStream.reduceByKey(new Functions.LongSumReducer());
    JavaPairDStream<String, Tuple2<Long, Long>> ipBytesRequestCountDStream = ipBytesSumDStream.join(ipCountsDStream);
    // Save our dstream of ip address request counts
    JavaPairDStream<Text, LongWritable> writableDStream = ipDStream.mapToPair(new PairFunction<Tuple2<String, Long>, Text, LongWritable>() {
        public Tuple2<Text, LongWritable> call(Tuple2<String, Long> e) {
            return new Tuple2(new Text(e._1()), new LongWritable(e._2()));
        }
    });
    class OutFormat extends SequenceFileOutputFormat<Text, LongWritable> {
    }
    writableDStream.saveAsHadoopFiles(outDir, "pandas", Text.class, LongWritable.class, OutFormat.class);
    // All ips more than 10
    JavaDStream<String> ipAddressDStream = ipCumDStream.transform(new Function<JavaPairRDD<String, Long>, JavaRDD<String>>() {
        public JavaRDD<String> call(JavaPairRDD<String, Long> rdd) {
            return Functions.filterIPAddress(rdd);
        }
    });
    ipAddressDStream.foreachRDD(new Function<JavaRDD<String>, Void>() {
        public Void call(JavaRDD<String> rdd) {
            List<String> currentIPAddresses = rdd.take(100);
            return null;
        }
    });
    // A DStream of endpoint to count.
    JavaPairDStream<String, Long> endpointCountsDStream = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<String, Long>>() {
        public JavaPairRDD<String, Long> call(JavaRDD<ApacheAccessLog> rdd) {
            return Functions.endpointCount(rdd);
        }
    }).updateStateByKey(new Functions.ComputeRunningSum());
    Object ordering = Ordering.natural();
    final Comparator<Long> cmp = (Comparator<Long>) ordering;
    endpointCountsDStream.foreachRDD(new Function<JavaPairRDD<String, Long>, Void>() {
        public Void call(JavaPairRDD<String, Long> rdd) {
            currentTopEndpoints = rdd.takeOrdered(10, new Functions.ValueComparator<String, Long>(cmp));
            return null;
        }
    });
}
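The cumulative-count pattern above (reduceByKey per batch, updateStateByKey for the running total, and a join of two keyed streams) can be sketched in a self-contained form with lambdas. This sketch assumes Spark 2.x APIs, where Optional is org.apache.spark.api.java.Optional rather than the Guava Optional used in the original learning-spark code; the socket source, the "<ip> <bytes>" line format, and the checkpoint path are illustrative.

import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.Optional;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;

public class RunningTotalsSketch {
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setAppName("running-totals").setMaster("local[2]");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(10));
        // updateStateByKey requires checkpointing; the path is illustrative.
        jssc.checkpoint("/tmp/running-totals-checkpoint");
        // Each input line is assumed to be "<ip> <bytes>".
        JavaDStream<String> lines = jssc.socketTextStream("localhost", 9999);
        JavaPairDStream<String, Long> requestsPerIp =
                lines.mapToPair(line -> new Tuple2<>(line.split(" ")[0], 1L))
                     .reduceByKey((a, b) -> a + b);
        JavaPairDStream<String, Long> bytesPerIp =
                lines.mapToPair(line -> new Tuple2<>(line.split(" ")[0],
                        Long.parseLong(line.split(" ")[1])))
                     .reduceByKey((a, b) -> a + b);
        // Running sum across all batches, playing the role of ComputeRunningSum above.
        JavaPairDStream<String, Long> totalRequestsPerIp = requestsPerIp.updateStateByKey(
                (List<Long> newValues, Optional<Long> state) -> {
                    long sum = state.orElse(0L);
                    for (Long value : newValues) {
                        sum += value;
                    }
                    return Optional.of(sum);
                });
        // Pair up the transfer volume with the request count per IP.
        JavaPairDStream<String, Tuple2<Long, Long>> bytesAndRequests = bytesPerIp.join(requestsPerIp);
        totalRequestsPerIp.print();
        bytesAndRequests.print();
        jssc.start();
        jssc.awaitTermination();
    }
}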