Use of org.apache.spark.sql.DataFrameReader in project kylo by Teradata.
The class AbstractJdbcDataSetProvider, method read().
@Nonnull
@Override
public final T read(@Nonnull final KyloCatalogClient<T> client, @Nonnull final DataSetOptions options) {
    // Set url for PostgreSQL databases
    final Option<String> catalog = options.getOption("PGDBNAME");
    final Option<String> url = options.getOption("url");
    if (catalog.isDefined() && url.isDefined() && url.get().startsWith("jdbc:postgresql://")) {
        final String[] urlSplit = url.get().split("\\?", 2);
        final String[] pathSplit = urlSplit[0].substring(18).split("/", 2);
        if (pathSplit.length == 1 || StringUtils.equalsAny(pathSplit[1], "", "/")) {
            String catalogUrl = "jdbc:postgresql://" + pathSplit[0] + "/" + urlEncode(catalog.get());
            if (urlSplit.length == 2) {
                catalogUrl += "?" + urlSplit[1];
            }
            options.setOption("url", catalogUrl);
        }
    }

    // Load data set
    final DataFrameReader reader = SparkUtil.prepareDataFrameReader(getDataFrameReader(client, options), options, null);
    reader.format(JdbcRelationProvider.class.getName());
    T dataSet = load(reader);

    // Handle high water mark
    final String dateField = SparkUtil.getOrElse(options.getOption(DATE_FIELD_OPTION), null);
    final String highWaterMarkKey = SparkUtil.getOrElse(options.getOption(HIGH_WATER_MARK_OPTION), null);
    final Long overlap = getOverlap(options);
    if (dateField != null && highWaterMarkKey != null) {
        final JdbcHighWaterMark initialValue = createHighWaterMark(highWaterMarkKey, client);
        dataSet = filterByDateTime(dataSet, dateField, initialValue.getValue(), overlap);
        dataSet = updateHighWaterMark(dataSet, dateField, initialValue, client);
    } else if (highWaterMarkKey != null) {
        log.warn("Ignoring '{}' option because '{}' option was not specified", HIGH_WATER_MARK_OPTION, DATE_FIELD_OPTION);
    } else if (overlap != null) {
        log.warn("Ignoring '{}' option because '{}' and '{}' options were not specified", OVERLAP_OPTION, DATE_FIELD_OPTION, HIGH_WATER_MARK_OPTION);
    }
    return dataSet;
}
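For reference, a minimal standalone sketch of the PostgreSQL URL rewrite performed at the top of this method. It substitutes java.net.URLEncoder for Kylo's urlEncode helper, and the class and method names here are invented for illustration only; the rewrite mirrors the logic in read(), appending the PGDBNAME catalog when the URL has no database in its path and keeping any query string.

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;

// Standalone sketch of the URL rewrite in AbstractJdbcDataSetProvider#read.
// URLEncoder stands in for Kylo's urlEncode helper (an assumption).
public class PostgresUrlRewriteSketch {

    static String withCatalog(final String url, final String catalog) throws UnsupportedEncodingException {
        if (!url.startsWith("jdbc:postgresql://")) {
            return url;                                     // only PostgreSQL URLs are rewritten
        }
        final String[] urlSplit = url.split("\\?", 2);      // separate the query string, if any
        final String[] pathSplit = urlSplit[0].substring(18).split("/", 2);
        if (pathSplit.length > 1 && !pathSplit[1].isEmpty() && !"/".equals(pathSplit[1])) {
            return url;                                     // a database is already present in the path
        }
        String catalogUrl = "jdbc:postgresql://" + pathSplit[0] + "/" + URLEncoder.encode(catalog, "UTF-8");
        if (urlSplit.length == 2) {
            catalogUrl += "?" + urlSplit[1];                // re-attach the original query string
        }
        return catalogUrl;
    }

    public static void main(final String[] args) throws UnsupportedEncodingException {
        // jdbc:postgresql://db.example.com:5432/?ssl=true -> jdbc:postgresql://db.example.com:5432/mydb?ssl=true
        System.out.println(withCatalog("jdbc:postgresql://db.example.com:5432/?ssl=true", "mydb"));
    }
}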
Use of org.apache.spark.sql.DataFrameReader in project kylo by Teradata.
The class AbstractSparkDataSetProvider, method read().
@Nonnull
@Override
public final T read(@Nonnull final KyloCatalogClient<T> client, @Nonnull final DataSetOptions options) {
    // Prepare reader
    final SparkDataSetContext<T> context = new SparkDataSetContext<>(options, client, this);
    final DataFrameReader reader = SparkUtil.prepareDataFrameReader(getDataFrameReader(client, context), context, client);
    final List<String> paths = context.getPaths();

    // Load and union data sets
    T dataSet = null;
    if (paths == null || paths.isEmpty() || context.isFileFormat()) {
        final Seq<String> pathSeq = (paths != null) ? JavaConversions.asScalaBuffer(paths) : null;
        dataSet = load(reader, pathSeq);
    } else {
        for (final String path : paths) {
            final T load = load(reader, Seq$.MODULE$.<String>newBuilder().$plus$eq(path).result());
            dataSet = (dataSet == null) ? load : union(dataSet, load);
        }
    }

    // Delete files on job end
    if (context.isFileFormat() && "false".equalsIgnoreCase(SparkUtil.getOrElse(context.getOption(KEEP_SOURCE_FILE_OPTION), ""))) {
        final StructType schema = schema(dataSet);
        if (!schema.isEmpty()) {
            // Watch for when data set is read
            final Accumulable<Boolean, Boolean> accumulator = accumulable(Boolean.FALSE, UUID.randomUUID().toString(), new BooleanOrAccumulatorParam(), client);
            final FlaggingVisitor visitor = new FlaggingVisitor(accumulator);
            dataSet = map(dataSet, schema.apply(0).name(), visitor, schema.apply(0).dataType());

            // Delete paths on job end
            final DeletePathsListener jobListener = new DeletePathsListener(context.getPaths(), accumulator, getHadoopConfiguration(client));
            onJobEnd(jobListener, client);
        }
    }
    return dataSet;
}
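As a rough illustration of the per-path load-and-union branch above, here is a sketch in plain Spark with none of the Kylo wrappers; the input paths, the CSV format, and the class name are placeholders chosen for the example, not anything taken from Kylo.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

// Sketch of loading several paths and unioning the results with plain Spark,
// mirroring the per-path loop in AbstractSparkDataSetProvider#read.
public class UnionPathsSketch {

    public static void main(final String[] args) {
        final SparkSession spark = SparkSession.builder().appName("union-paths-sketch").getOrCreate();
        final List<String> paths = Arrays.asList("/data/part1", "/data/part2");

        Dataset<Row> dataSet = null;
        for (final String path : paths) {
            final Dataset<Row> load = spark.read().format("csv").option("header", "true").load(path);
            dataSet = (dataSet == null) ? load : dataSet.union(load);   // union requires matching schemas
        }
        if (dataSet != null) {
            dataSet.show();
        }
        spark.stop();
    }
}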