use of co.cask.cdap.etl.spark.SparkStageStatisticsCollector in project cdap by caskdata.
The updateWorkflowToken method of the class BatchSparkPipelineDriver.
// Copies each stage's input/output/error record counts into the workflow token,
// under keys of the form <StageStatistics.PREFIX>.<stageName>.<metric>.
private void updateWorkflowToken(WorkflowToken token, Map<String, StageStatisticsCollector> collectors) {
  for (Map.Entry<String, StageStatisticsCollector> entry : collectors.entrySet()) {
    SparkStageStatisticsCollector collector = (SparkStageStatisticsCollector) entry.getValue();
    String keyPrefix = Constants.StageStatistics.PREFIX + "." + entry.getKey() + ".";

    String inputRecordKey = keyPrefix + Constants.StageStatistics.INPUT_RECORDS;
    token.put(inputRecordKey, String.valueOf(collector.getInputRecordCount()));

    String outputRecordKey = keyPrefix + Constants.StageStatistics.OUTPUT_RECORDS;
    token.put(outputRecordKey, String.valueOf(collector.getOutputRecordCount()));

    String errorRecordKey = keyPrefix + Constants.StageStatistics.ERROR_RECORDS;
    token.put(errorRecordKey, String.valueOf(collector.getErrorRecordCount()));
  }
}
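For reference, a downstream consumer such as a condition plugin can read these values back from the same workflow token. The snippet below is only a minimal sketch, not code from the project; the stage name "source" is hypothetical, and it assumes the standard WorkflowToken.get/Value accessors.

// Sketch only: reading the statistics written by updateWorkflowToken above.
// The stage name "source" is hypothetical.
String keyPrefix = Constants.StageStatistics.PREFIX + "." + "source" + ".";
Value inputRecords = token.get(keyPrefix + Constants.StageStatistics.INPUT_RECORDS);
if (inputRecords != null) {
  // The values were stored with String.valueOf(long), so parse them back as longs.
  long inputCount = Long.parseLong(inputRecords.toString());
  boolean sourceProducedRecords = inputCount > 0;
}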
use of co.cask.cdap.etl.spark.SparkStageStatisticsCollector in project cdap by caskdata.
The run method of the class BatchSparkPipelineDriver.
@Override
public void run(DatasetContext context) throws Exception {
  // Deserialize the phase spec for this pipeline phase from the program properties.
  BatchPhaseSpec phaseSpec = GSON.fromJson(sec.getSpecification().getProperty(Constants.PIPELINEID), BatchPhaseSpec.class);

  // Read the localized source/sink factory configuration file.
  Path configFile = sec.getLocalizationContext().getLocalFile("HydratorSpark.config").toPath();
  try (BufferedReader reader = Files.newBufferedReader(configFile, StandardCharsets.UTF_8)) {
    String object = reader.readLine();
    SparkBatchSourceSinkFactoryInfo sourceSinkInfo = GSON.fromJson(object, SparkBatchSourceSinkFactoryInfo.class);
    sourceFactory = sourceSinkInfo.getSparkBatchSourceFactory();
    sinkFactory = sourceSinkInfo.getSparkBatchSinkFactory();
    stagePartitions = sourceSinkInfo.getStagePartitions();
  }

  datasetContext = context;
  numOfRecordsPreview = phaseSpec.getNumOfRecordsPreview();
  PipelinePluginContext pluginContext = new PipelinePluginContext(sec.getPluginContext(), sec.getMetrics(), phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());

  // Per-stage statistics are only collected when the pipeline contains a condition stage,
  // since condition stages are what read these statistics from the workflow token.
  Map<String, StageStatisticsCollector> collectors = new HashMap<>();
  if (phaseSpec.pipelineContainsCondition()) {
    Iterator<StageSpec> iterator = phaseSpec.getPhase().iterator();
    while (iterator.hasNext()) {
      StageSpec spec = iterator.next();
      collectors.put(spec.getName(), new SparkStageStatisticsCollector(jsc));
    }
  }

  try {
    PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(pluginContext, sec.getMetrics(), phaseSpec, new SingleConnectorFactory());
    runPipeline(phaseSpec.getPhase(), BatchSource.PLUGIN_TYPE, sec, stagePartitions, pluginInstantiator, collectors);
  } finally {
    // Publish whatever statistics were collected, even if the pipeline run failed.
    updateWorkflowToken(sec.getWorkflowToken(), collectors);
  }
}
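SparkStageStatisticsCollector is constructed with the JavaSparkContext (jsc), which suggests its counts are kept in Spark accumulators so that increments made inside executor tasks are aggregated back on the driver. The class below is only an illustrative sketch of that idea, assuming the Spark 2.x accumulator API; it is not the project's actual implementation, though the getter names mirror the ones called in updateWorkflowToken.

// Sketch only: a stage statistics collector backed by Spark accumulators.
// This is not co.cask.cdap.etl.spark.SparkStageStatisticsCollector itself.
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.LongAccumulator;

public class AccumulatorBackedStatsCollector {
  private final LongAccumulator inputRecords;
  private final LongAccumulator outputRecords;
  private final LongAccumulator errorRecords;

  public AccumulatorBackedStatsCollector(JavaSparkContext jsc) {
    // Accumulators are registered with the SparkContext so that counts incremented
    // inside executor tasks are merged on the driver.
    this.inputRecords = jsc.sc().longAccumulator("input.records");
    this.outputRecords = jsc.sc().longAccumulator("output.records");
    this.errorRecords = jsc.sc().longAccumulator("error.records");
  }

  public void incrementInputRecordCount() { inputRecords.add(1L); }
  public void incrementOutputRecordCount() { outputRecords.add(1L); }
  public void incrementErrorRecordCount() { errorRecords.add(1L); }

  // Getter names mirror those used by updateWorkflowToken above.
  public long getInputRecordCount() { return inputRecords.value(); }
  public long getOutputRecordCount() { return outputRecords.value(); }
  public long getErrorRecordCount() { return errorRecords.value(); }
}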