Use of com.thinkbiganalytics.kylo.catalog.spark.sources.spark.SparkDataSetContext in project kylo by Teradata.
From the class AbstractSparkDataSetProvider, the method read: it prepares a Spark DataFrameReader, loads the configured paths (unioning per-path loads for non-file sources), and, when the keep-source-file option is "false", registers a job listener that deletes the source files once they have actually been read.
@Nonnull
@Override
public final T read(@Nonnull final KyloCatalogClient<T> client, @Nonnull final DataSetOptions options) {
    // Prepare reader
    final SparkDataSetContext<T> context = new SparkDataSetContext<>(options, client, this);
    final DataFrameReader reader = SparkUtil.prepareDataFrameReader(getDataFrameReader(client, context), context, client);
    final List<String> paths = context.getPaths();

    // Load and union data sets
    T dataSet = null;
    if (paths == null || paths.isEmpty() || context.isFileFormat()) {
        // File formats accept all paths in a single load call
        final Seq<String> pathSeq = (paths != null) ? JavaConversions.asScalaBuffer(paths) : null;
        dataSet = load(reader, pathSeq);
    } else {
        // Non-file sources are loaded one path at a time and unioned together
        for (final String path : paths) {
            final T load = load(reader, Seq$.MODULE$.<String>newBuilder().$plus$eq(path).result());
            dataSet = (dataSet == null) ? load : union(dataSet, load);
        }
    }

    // Delete files on job end
    if (context.isFileFormat() && "false".equalsIgnoreCase(SparkUtil.getOrElse(context.getOption(KEEP_SOURCE_FILE_OPTION), ""))) {
        final StructType schema = schema(dataSet);
        if (!schema.isEmpty()) {
            // Watch for when the data set is read: the boolean-OR accumulator
            // flips to TRUE once any row passes through the mapped column
            final Accumulable<Boolean, Boolean> accumulator = accumulable(Boolean.FALSE, UUID.randomUUID().toString(), new BooleanOrAccumulatorParam(), client);
            final FlaggingVisitor visitor = new FlaggingVisitor(accumulator);
            dataSet = map(dataSet, schema.apply(0).name(), visitor, schema.apply(0).dataType());

            // Delete paths on job end, guarded by the accumulator
            final DeletePathsListener jobListener = new DeletePathsListener(context.getPaths(), accumulator, getHadoopConfiguration(client));
            onJobEnd(jobListener, client);
        }
    }
    return dataSet;
}
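The delete-on-job-end hook hinges on a boolean-OR accumulator: the FlaggingVisitor flips it to TRUE as rows flow through the mapped column, so the listener only deletes paths that were actually read. Below is a minimal sketch of such an accumulator param, written against Spark's (now deprecated) AccumulableParam API; the name mirrors Kylo's BooleanOrAccumulatorParam, but the body is illustrative rather than Kylo's implementation.

import org.apache.spark.AccumulableParam;

public class BooleanOrAccumulatorParam implements AccumulableParam<Boolean, Boolean> {

    // Merge a new observation into the accumulator: TRUE once anything was read
    @Override
    public Boolean addAccumulator(final Boolean current, final Boolean value) {
        return current || value;
    }

    // Merge two partial accumulators from different partitions
    @Override
    public Boolean addInPlace(final Boolean value1, final Boolean value2) {
        return value1 || value2;
    }

    // Neutral element for the boolean-OR reduction
    @Override
    public Boolean zero(final Boolean initialValue) {
        return Boolean.FALSE;
    }
}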
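On the other side of the accumulator sits the job listener. The sketch below shows the kind of check a listener like DeletePathsListener can perform at job end, using the Hadoop FileSystem API; the class DeletePathsSketch and the helper deleteIfRead are hypothetical names for illustration, not Kylo's code.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.Accumulable;

public class DeletePathsSketch {

    // Delete the source paths only if the accumulator shows the data set was read
    public static void deleteIfRead(final List<String> paths, final Accumulable<Boolean, Boolean> readFlag, final Configuration conf) throws IOException {
        if (Boolean.TRUE.equals(readFlag.value())) {
            for (final String pathName : paths) {
                final Path path = new Path(pathName);
                final FileSystem fs = path.getFileSystem(conf);
                fs.delete(path, true);  // recursive delete of the consumed source
            }
        }
    }
}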