
Example 6 with DataSet

Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.

The class Profiler, method run():

public void run(String[] args) {
    /* Variables */
    DataSet resultDF;
    String queryString;
    /* Check command line arguments and get query to run. */
    if ((queryString = checkCommandLineArgs(args)) == null) {
        return;
    }
    /* Run query and get result */
    log.info("[PROFILER-INFO] Analyzing profile statistics for: [{}]", queryString);
    resultDF = sparkContextService.sql(sqlContext, queryString);
    /* Get profile statistics and write to table */
    final StatisticsModel statisticsModel = profiler.profile(resultDF, profilerConfiguration);
    if (statisticsModel != null) {
        OutputWriter.writeModel(statisticsModel, profilerConfiguration, sqlContext, sparkContextService);
    } else {
        log.info("[PROFILER-INFO] No data to process. Hence, no profile statistics generated.");
    }
    /* Wrap up */
    log.info("[PROFILER-INFO] Profiling finished.");
}
Also used: StatisticsModel (com.thinkbiganalytics.spark.dataprofiler.StatisticsModel), DataSet (com.thinkbiganalytics.spark.DataSet)
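
For orientation, here is a minimal sketch of the same run-a-query-and-summarize flow using only stock Spark 1.x classes, without kylo's DataSet/SparkContextService wrappers. The table and app names are placeholders, and describe() is only a rough stand-in for the StatisticsModel that kylo's Profiler produces.

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

public class ProfileSketch {
    public static void main(String[] args) {
        // Plain Spark setup; kylo injects sqlContext and sparkContextService instead.
        SparkContext sc = new SparkContext(new SparkConf().setAppName("profile-sketch"));
        SQLContext sqlContext = new SQLContext(sc);

        // Run the query directly; scs.sql(sqlContext, queryString) plays this role in the example above.
        DataFrame resultDF = sqlContext.sql("SELECT * FROM example_db.example_table");

        // describe() computes count/mean/stddev/min/max for numeric columns --
        // a rough stand-in for the profile statistics written by OutputWriter.writeModel.
        resultDF.describe().show();

        sc.stop();
    }
}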

Example 7 with DataSet

Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.

The class StandardDataValidator, method validateTable():

@Nonnull
@Override
public DataValidatorResult validateTable(@Nonnull final String databaseName, @Nonnull final String sourceTableName, @Nonnull final String targetTableName, @Nonnull final String partition, final int numPartitions, @Nonnull final Map<String, FieldPolicy> policyMap, @Nonnull final HiveContext hiveContext) {
    // Extract fields from a source table
    StructField[] fields = resolveSchema(databaseName, targetTableName, hiveContext);
    FieldPolicy[] policies = resolvePolicies(fields, policyMap);
    String selectStmt = toSelectFields(policies);
    String sql = "SELECT " + selectStmt + " FROM " + HiveUtils.quoteIdentifier(databaseName, sourceTableName) + " WHERE processing_dttm = '" + partition + "'";
    log.info("Executing query {}", sql);
    DataSet sourceDF = scs.sql(hiveContext, sql);
    // Repartition if necessary
    if (numPartitions > 0) {
        log.info("Partition count: {}", numPartitions);
        sourceDF = sourceDF.repartition(numPartitions);
    }
    return validate(sourceDF, policies, fields);
}
Also used: StructField (org.apache.spark.sql.types.StructField), FieldPolicy (com.thinkbiganalytics.policy.FieldPolicy), DataSet (com.thinkbiganalytics.spark.DataSet), Nonnull (javax.annotation.Nonnull)
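
To make the generated statement concrete, here is a small sketch of the string the method builds, using hypothetical database, table, partition, and column values. In the real code the select list comes from toSelectFields(policies) and HiveUtils.quoteIdentifier produces the backtick-quoted identifiers.

// Hypothetical inputs, for illustration only.
String databaseName = "web";
String sourceTableName = "pageviews_feed";
String partition = "20240101120000";
String selectStmt = "`id`, `url`, `view_time`"; // built by toSelectFields(policies) in the real code

String sql = "SELECT " + selectStmt
        + " FROM `" + databaseName + "`.`" + sourceTableName + "`"   // HiveUtils.quoteIdentifier equivalent
        + " WHERE processing_dttm = '" + partition + "'";
// Result: SELECT `id`, `url`, `view_time` FROM `web`.`pageviews_feed` WHERE processing_dttm = '20240101120000'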

Example 8 with DataSet

Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.

The class Cleanup, method doCleanup():

public void doCleanup() {
    try {
        SparkContext sparkContext = SparkContext.getOrCreate();
        hiveContext = new org.apache.spark.sql.hive.HiveContext(sparkContext);
        String sql = "SELECT COUNT(*) FROM " + categoryName + "." + feedName;
        log.info("Executing query {}", sql);
        DataSet dataFrame = scs.sql(getHiveContext(), sql);
        List<Row> resultRows = dataFrame.collectAsList();
        long rowCount = 0;
        if (resultRows.size() > 0) {
            rowCount = resultRows.get(0).getLong(0);
        }
        log.info("Total rows in {}.{}: {}", categoryName, feedName, rowCount);
    } catch (Exception e) {
        log.error("An error occurred during running cleanup: {}", e.getMessage());
        System.exit(1);
    }
}
Also used: SparkContext (org.apache.spark.SparkContext), DataSet (com.thinkbiganalytics.spark.DataSet), HiveContext (org.apache.spark.sql.hive.HiveContext), Row (org.apache.spark.sql.Row)
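
For comparison, the same read-a-single-scalar pattern with the plain Spark API, bypassing kylo's scs.sql/DataSet wrappers. Table names are placeholders, and the snippet assumes the same imports as the example above (SparkContext, HiveContext, Row, java.util.List).

// Minimal sketch: read a single COUNT(*) value without the kylo wrappers.
HiveContext hiveContext = new org.apache.spark.sql.hive.HiveContext(SparkContext.getOrCreate());
List<Row> rows = hiveContext.sql("SELECT COUNT(*) FROM example_category.example_feed").collectAsList();
long rowCount = rows.isEmpty() ? 0L : rows.get(0).getLong(0);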

Example 9 with DataSet

Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.

The class DataQualityChecker, method doDataQualityChecks():

public void doDataQualityChecks() {
    try {
        SparkContext sparkContext = SparkContext.getOrCreate();
        hiveContext = new org.apache.spark.sql.hive.HiveContext(sparkContext);
        String sql = "SELECT COUNT(*) FROM " + categoryName + "." + feedName;
        log.info("Executing query {}", sql);
        DataSet dataFrame = scs.sql(getHiveContext(), sql);
        List<Row> resultRows = dataFrame.collectAsList();
        long rowCount = 0;
        if (resultRows.size() > 0) {
            rowCount = resultRows.get(0).getLong(0);
        }
        log.info("Total rows in {}.{}: {}", categoryName, feedName, rowCount);
    } catch (Exception e) {
        log.error("An error occurred during running data quality checks: {}", e.getMessage());
        System.exit(1);
    }
}
Also used: SparkContext (org.apache.spark.SparkContext), DataSet (com.thinkbiganalytics.spark.DataSet), HiveContext (org.apache.spark.sql.hive.HiveContext), Row (org.apache.spark.sql.Row)

Example 10 with DataSet

Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.

The class TransformService, method execute():

/**
 * Executes the specified transformation and returns the name of the Hive table containing the results.
 *
 * @param request the transformation request
 * @return the Hive table containing the results
 * @throws IllegalStateException if this service is not running
 * @throws ScriptException       if the script cannot be executed
 */
@Nonnull
public TransformResponse execute(@Nonnull final TransformRequest request) throws ScriptException {
    log.entry(request);
    // Handle async request
    if (request.isAsync()) {
        return cacheTransform(request);
    }
    // Execute script
    final DataSet dataSet = createShellTask(request);
    final StructType schema = dataSet.schema();
    TransformResponse response = submitTransformJob(new ShellTransformStage(dataSet), getPolicies(request));
    // Build response
    if (response.getStatus() != TransformResponse.Status.SUCCESS) {
        final String table = response.getTable();
        final TransformQueryResult partialResult = new TransformQueryResult();
        partialResult.setColumns(Arrays.<QueryResultColumn>asList(new QueryResultRowTransform(schema, table).columns()));
        response = new TransformResponse();
        response.setProgress(0.0);
        response.setResults(partialResult);
        response.setStatus(TransformResponse.Status.PENDING);
        response.setTable(table);
    }
    return log.exit(response);
}
Also used: QueryResultRowTransform (com.thinkbiganalytics.spark.metadata.QueryResultRowTransform), StructType (org.apache.spark.sql.types.StructType), DataSet (com.thinkbiganalytics.spark.DataSet), ShellTransformStage (com.thinkbiganalytics.spark.metadata.ShellTransformStage), TransformQueryResult (com.thinkbiganalytics.spark.rest.model.TransformQueryResult), TransformResponse (com.thinkbiganalytics.spark.rest.model.TransformResponse), Nonnull (javax.annotation.Nonnull)
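
A hypothetical caller-side sketch of how the PENDING status might be consumed: poll until the transform completes. Only execute(), getStatus(), getTable(), and TransformResponse.Status come from the example above; transformService and the fetchStatus(table) helper are assumptions for illustration.

// Hypothetical polling loop; fetchStatus(table) is an assumed lookup by result table name.
TransformResponse awaitCompletion(TransformRequest request) throws ScriptException, InterruptedException {
    TransformResponse response = transformService.execute(request);
    while (response.getStatus() == TransformResponse.Status.PENDING) {
        Thread.sleep(1000L);                          // wait before polling again
        response = fetchStatus(response.getTable());  // hypothetical helper
    }
    return response;
}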

Aggregations

DataSet (com.thinkbiganalytics.spark.DataSet): 14
StructType (org.apache.spark.sql.types.StructType): 7
Nonnull (javax.annotation.Nonnull): 4
SparkContext (org.apache.spark.SparkContext): 4
TransformResponse (com.thinkbiganalytics.spark.rest.model.TransformResponse): 3
Row (org.apache.spark.sql.Row): 3
NamedParam (scala.tools.nsc.interpreter.NamedParam): 3
SparkContextService (com.thinkbiganalytics.spark.SparkContextService): 2
StatisticsModel (com.thinkbiganalytics.spark.dataprofiler.StatisticsModel): 2
ShellTransformStage (com.thinkbiganalytics.spark.metadata.ShellTransformStage): 2
SparkScriptEngine (com.thinkbiganalytics.spark.repl.SparkScriptEngine): 2
TransformRequest (com.thinkbiganalytics.spark.rest.model.TransformRequest): 2
DatasourceProvider (com.thinkbiganalytics.spark.shell.DatasourceProvider): 2
InputStream (java.io.InputStream): 2
ArrayList (java.util.ArrayList): 2
List (java.util.List): 2
HiveContext (org.apache.spark.sql.hive.HiveContext): 2
StructField (org.apache.spark.sql.types.StructField): 2
StorageLevel (org.apache.spark.storage.StorageLevel): 2
Test (org.junit.Test): 2