Example 1 with DataFrameReader

Use of org.apache.spark.sql.DataFrameReader in project kylo by Teradata.

The read method of the class AbstractJdbcDataSetProvider:

@Nonnull
@Override
public final T read(@Nonnull final KyloCatalogClient<T> client, @Nonnull final DataSetOptions options) {
    // Set url for PostgreSQL databases
    final Option<String> catalog = options.getOption("PGDBNAME");
    final Option<String> url = options.getOption("url");
    if (catalog.isDefined() && url.isDefined() && url.get().startsWith("jdbc:postgresql://")) {
        final String[] urlSplit = url.get().split("\\?", 2);
        // Drop the 18-character "jdbc:postgresql://" prefix, then split host from database path
        final String[] pathSplit = urlSplit[0].substring(18).split("/", 2);
        if (pathSplit.length == 1 || StringUtils.equalsAny(pathSplit[1], "", "/")) {
            String catalogUrl = "jdbc:postgresql://" + pathSplit[0] + "/" + urlEncode(catalog.get());
            if (urlSplit.length == 2) {
                catalogUrl += "?" + urlSplit[1];
            }
            options.setOption("url", catalogUrl);
        }
    }
    // Load data set
    final DataFrameReader reader = SparkUtil.prepareDataFrameReader(getDataFrameReader(client, options), options, null);
    reader.format(JdbcRelationProvider.class.getName());
    T dataSet = load(reader);
    // Handle high water mark
    final String dateField = SparkUtil.getOrElse(options.getOption(DATE_FIELD_OPTION), null);
    final String highWaterMarkKey = SparkUtil.getOrElse(options.getOption(HIGH_WATER_MARK_OPTION), null);
    final Long overlap = getOverlap(options);
    if (dateField != null && highWaterMarkKey != null) {
        final JdbcHighWaterMark initialValue = createHighWaterMark(highWaterMarkKey, client);
        dataSet = filterByDateTime(dataSet, dateField, initialValue.getValue(), overlap);
        dataSet = updateHighWaterMark(dataSet, dateField, initialValue, client);
    } else if (highWaterMarkKey != null) {
        log.warn("Ignoring '{}' option because '{}' option was not specified", HIGH_WATER_MARK_OPTION, DATE_FIELD_OPTION);
    } else if (overlap != null) {
        log.warn("Ignoring '{}' option because '{}' and '{}' options were not specified", OVERLAP_OPTION, DATE_FIELD_OPTION, HIGH_WATER_MARK_OPTION);
    }
    return dataSet;
}
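
The PGDBNAME rewrite at the top of this method is dense enough to deserve a standalone reading. Below is a minimal sketch of the same logic, assuming the standard jdbc:postgresql:// prefix; the class name, rewriteUrl, and the sample URL are hypothetical, and the original's urlEncode call is omitted for brevity:

public class PostgresUrlRewriteSketch {

    private static final String PREFIX = "jdbc:postgresql://";

    /** Inserts the catalog as the database path when the URL does not already name one. */
    static String rewriteUrl(final String url, final String catalog) {
        if (!url.startsWith(PREFIX)) {
            return url;
        }
        final String[] urlSplit = url.split("\\?", 2);                                 // separate the query string
        final String[] pathSplit = urlSplit[0].substring(PREFIX.length()).split("/", 2); // host[:port] vs. database path
        if (pathSplit.length == 1 || pathSplit[1].isEmpty()) {                         // no database in the path
            String catalogUrl = PREFIX + pathSplit[0] + "/" + catalog;
            if (urlSplit.length == 2) {
                catalogUrl += "?" + urlSplit[1];                                       // re-append the query string
            }
            return catalogUrl;
        }
        return url;
    }

    public static void main(final String[] args) {
        // Prints "jdbc:postgresql://localhost:5432/mydb?ssl=true"
        System.out.println(rewriteUrl("jdbc:postgresql://localhost:5432?ssl=true", "mydb"));
    }
}

The query string is split off first so that the database name is only inserted when the path component is genuinely empty.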
Also used: JdbcRelationProvider (com.thinkbiganalytics.kylo.catalog.spark.sources.jdbc.JdbcRelationProvider), JdbcHighWaterMark (com.thinkbiganalytics.kylo.catalog.spark.sources.jdbc.JdbcHighWaterMark), DataFrameReader (org.apache.spark.sql.DataFrameReader), Nonnull (javax.annotation.Nonnull)
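
The high-water-mark branch implements incremental loading: rows older than the last saved mark, minus an optional overlap window, are filtered out, and the mark is advanced after the read. The following simplified sketch uses the plain Spark 2.x Java API; filterByDateTime, nextMark, and the millisecond parameters are hypothetical names:

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.lit;
import static org.apache.spark.sql.functions.max;

import java.sql.Timestamp;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

public class HighWaterMarkSketch {

    /** Keeps only rows whose date field is newer than the last mark, minus an overlap window. */
    static Dataset<Row> filterByDateTime(final Dataset<Row> rows, final String dateField,
                                         final long lastMarkMillis, final long overlapMillis) {
        final Timestamp cutoff = new Timestamp(lastMarkMillis - overlapMillis);
        return rows.filter(col(dateField).gt(lit(cutoff)));
    }

    /** Computes the next mark from the newly read rows (this runs an extra Spark job). */
    static Timestamp nextMark(final Dataset<Row> freshRows, final String dateField) {
        return freshRows.agg(max(col(dateField))).first().getTimestamp(0);
    }
}

Note that kylo's updateHighWaterMark above records the new value through the client during the read itself; the eager aggregate in nextMark is a simplification.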

Example 2 with DataFrameReader

Use of org.apache.spark.sql.DataFrameReader in project kylo by Teradata.

The read method of the class AbstractSparkDataSetProvider:

@Nonnull
@Override
public final T read(@Nonnull final KyloCatalogClient<T> client, @Nonnull final DataSetOptions options) {
    // Prepare reader
    final SparkDataSetContext<T> context = new SparkDataSetContext<>(options, client, this);
    final DataFrameReader reader = SparkUtil.prepareDataFrameReader(getDataFrameReader(client, context), context, client);
    final List<String> paths = context.getPaths();
    // Load and union data sets
    T dataSet = null;
    if (paths == null || paths.isEmpty() || context.isFileFormat()) {
        final Seq<String> pathSeq = (paths != null) ? JavaConversions.asScalaBuffer(paths) : null;
        dataSet = load(reader, pathSeq);
    } else {
        for (final String path : paths) {
            // Load each path individually as a single-element Scala Seq, then union the results
            T load = load(reader, Seq$.MODULE$.<String>newBuilder().$plus$eq(path).result());
            dataSet = (dataSet == null) ? load : union(dataSet, load);
        }
        }
    }
    // Delete files on job end
    if (context.isFileFormat() && "false".equalsIgnoreCase(SparkUtil.getOrElse(context.getOption(KEEP_SOURCE_FILE_OPTION), ""))) {
        final StructType schema = schema(dataSet);
        if (!schema.isEmpty()) {
            // Watch for when data set is read
            final Accumulable<Boolean, Boolean> accumulator = accumulable(Boolean.FALSE, UUID.randomUUID().toString(), new BooleanOrAccumulatorParam(), client);
            final FlaggingVisitor visitor = new FlaggingVisitor(accumulator);
            dataSet = map(dataSet, schema.apply(0).name(), visitor, schema.apply(0).dataType());
            // Delete paths on job end
            final DeletePathsListener jobListener = new DeletePathsListener(context.getPaths(), accumulator, getHadoopConfiguration(client));
            onJobEnd(jobListener, client);
        }
    }
    return dataSet;
}
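
The else-branch loads each path on its own and unions the results. The equivalent pattern against the stock Spark 2.x Java API might look like the sketch below; the CSV format and sample paths are assumptions for illustration, and Spark's union matches columns by position, so all paths must share a schema:

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class LoadAndUnionSketch {

    public static void main(final String[] args) {
        final SparkSession spark = SparkSession.builder().appName("load-and-union").getOrCreate();
        final List<String> paths = Arrays.asList("/data/part1.csv", "/data/part2.csv");

        // Load each path separately, then union the results, mirroring the else-branch above
        Dataset<Row> dataSet = null;
        for (final String path : paths) {
            final Dataset<Row> load = spark.read().format("csv").option("header", "true").load(path);
            dataSet = (dataSet == null) ? load : dataSet.union(load);
        }
        dataSet.show();
    }
}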
Also used: SparkDataSetContext (com.thinkbiganalytics.kylo.catalog.spark.sources.spark.SparkDataSetContext), StructType (org.apache.spark.sql.types.StructType), DataFrameReader (org.apache.spark.sql.DataFrameReader), DeletePathsListener (com.thinkbiganalytics.kylo.catalog.spark.sources.spark.DeletePathsListener), FlaggingVisitor (com.thinkbiganalytics.kylo.catalog.spark.sources.spark.FlaggingVisitor), BooleanOrAccumulatorParam (com.thinkbiganalytics.kylo.catalog.spark.sources.spark.BooleanOrAccumulatorParam), Nonnull (javax.annotation.Nonnull)
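
The delete-on-job-end block only removes source files if the data set was actually read: FlaggingVisitor maps the first column through a function that flips a shared accumulator, and DeletePathsListener checks that flag when the job finishes. A rough equivalent of the flagging step with a Spark 2.x LongAccumulator is sketched below with hypothetical names (withReadFlag, the /tmp/in path); RowEncoder.apply is the pre-Spark-3.5 way to obtain a Row encoder:

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.util.LongAccumulator;

public class ReadFlagSketch {

    /** Wraps the data set so the accumulator records whether any row was computed. */
    static Dataset<Row> withReadFlag(final Dataset<Row> dataSet, final LongAccumulator readFlag) {
        return dataSet.map((MapFunction<Row, Row>) row -> {
            readFlag.add(1L); // runs only when Spark actually materializes the row
            return row;
        }, RowEncoder.apply(dataSet.schema()));
    }

    public static void main(final String[] args) {
        final SparkSession spark = SparkSession.builder().appName("read-flag").getOrCreate();
        final LongAccumulator readFlag = spark.sparkContext().longAccumulator("rows-read");
        final Dataset<Row> rows = withReadFlag(spark.read().json("/tmp/in"), readFlag);
        rows.count();                               // forces materialization
        System.out.println(readFlag.value() > 0);   // true once any row was read
    }
}

On job completion, a listener (registered here via onJobEnd) would test readFlag.value() > 0 before deleting context.getPaths() with the Hadoop FileSystem API, which is what DeletePathsListener does in kylo.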

Aggregations

Nonnull (javax.annotation.Nonnull): 2 uses
DataFrameReader (org.apache.spark.sql.DataFrameReader): 2 uses
JdbcHighWaterMark (com.thinkbiganalytics.kylo.catalog.spark.sources.jdbc.JdbcHighWaterMark): 1 use
JdbcRelationProvider (com.thinkbiganalytics.kylo.catalog.spark.sources.jdbc.JdbcRelationProvider): 1 use
BooleanOrAccumulatorParam (com.thinkbiganalytics.kylo.catalog.spark.sources.spark.BooleanOrAccumulatorParam): 1 use
DeletePathsListener (com.thinkbiganalytics.kylo.catalog.spark.sources.spark.DeletePathsListener): 1 use
FlaggingVisitor (com.thinkbiganalytics.kylo.catalog.spark.sources.spark.FlaggingVisitor): 1 use
SparkDataSetContext (com.thinkbiganalytics.kylo.catalog.spark.sources.spark.SparkDataSetContext): 1 use
StructType (org.apache.spark.sql.types.StructType): 1 use