Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.
The class Profiler, method run().
public void run(String[] args) {
    /* Variables */
    DataSet resultDF;
    String queryString;

    /* Check command line arguments and get query to run. */
    if ((queryString = checkCommandLineArgs(args)) == null) {
        return;
    }

    /* Run query and get result */
    log.info("[PROFILER-INFO] Analyzing profile statistics for: [{}]", queryString);
    resultDF = sparkContextService.sql(sqlContext, queryString);

    /* Get profile statistics and write to table */
    final StatisticsModel statisticsModel = profiler.profile(resultDF, profilerConfiguration);
    if (statisticsModel != null) {
        OutputWriter.writeModel(statisticsModel, profilerConfiguration, sqlContext, sparkContextService);
    } else {
        log.info("[PROFILER-INFO] No data to process. Hence, no profile statistics generated.");
    }

    /* Wrap up */
    log.info("[PROFILER-INFO] Profiling finished.");
}
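The helper checkCommandLineArgs is referenced but not shown above. A minimal sketch of what such a helper might look like follows; the argument layout assumed here (an object type of "table" or "query" followed by its value) and the method body are illustrative assumptions, not the actual Kylo implementation.

/* Hypothetical sketch: derive the query string to profile from the command line arguments.
   The two-argument layout assumed here ("table"/"query" followed by a value) is illustrative only. */
private String checkCommandLineArgs(String[] args) {
    if (args == null || args.length < 2) {
        log.error("[PROFILER-ERROR] Expected arguments: <table|query> <value>");
        return null;
    }
    if ("table".equalsIgnoreCase(args[0])) {
        // Profile the full table.
        return "SELECT * FROM " + args[1];
    } else if ("query".equalsIgnoreCase(args[0])) {
        // Profile the result of an arbitrary query.
        return args[1];
    }
    log.error("[PROFILER-ERROR] Unrecognized object type: {}", args[0]);
    return null;
}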
Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.
The class StandardDataValidator, method validateTable().
@Nonnull
@Override
public DataValidatorResult validateTable(@Nonnull final String databaseName, @Nonnull final String sourceTableName, @Nonnull final String targetTableName, @Nonnull final String partition,
                                         final int numPartitions, @Nonnull final Map<String, FieldPolicy> policyMap, @Nonnull final HiveContext hiveContext) {
    // Resolve the field list from the target table's schema and match it against the field policies
    StructField[] fields = resolveSchema(databaseName, targetTableName, hiveContext);
    FieldPolicy[] policies = resolvePolicies(fields, policyMap);
    String selectStmt = toSelectFields(policies);
    String sql = "SELECT " + selectStmt + " FROM " + HiveUtils.quoteIdentifier(databaseName, sourceTableName) + " WHERE processing_dttm = '" + partition + "'";
    log.info("Executing query {}", sql);
    DataSet sourceDF = scs.sql(hiveContext, sql);

    // Repartition if necessary
    if (numPartitions > 0) {
        log.info("Partition count: {}", numPartitions);
        sourceDF = sourceDF.repartition(numPartitions);
    }
    return validate(sourceDF, policies, fields);
}
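toSelectFields(policies) is not shown here; conceptually it turns the resolved field policies into the projection list of the validation query. A hedged sketch of that idea, written against plain field names rather than Kylo's FieldPolicy type (whose accessors are not shown above), might look like this:

/* Illustrative sketch only: build the projection list for the validation query
   from a set of field names, quoting each one with back-ticks as Hive expects. */
static String toSelectFields(String[] fieldNames) {
    StringBuilder sb = new StringBuilder();
    for (String name : fieldNames) {
        if (sb.length() > 0) {
            sb.append(", ");
        }
        sb.append('`').append(name).append('`');
    }
    return sb.toString();
}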
Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.
The class Cleanup, method doCleanup().
public void doCleanup() {
    try {
        SparkContext sparkContext = SparkContext.getOrCreate();
        hiveContext = new org.apache.spark.sql.hive.HiveContext(sparkContext);

        String sql = "SELECT COUNT(*) FROM " + categoryName + "." + feedName;
        log.info("Executing query {}", sql);
        DataSet dataFrame = scs.sql(getHiveContext(), sql);
        List<Row> resultRows = dataFrame.collectAsList();

        long rowCount = 0;
        if (resultRows.size() > 0) {
            rowCount = resultRows.get(0).getLong(0);
        }
        log.info("Total rows in {}.{}: {}", categoryName, feedName, rowCount);
    } catch (Exception e) {
        log.error("An error occurred during running cleanup: {}", e.getMessage());
        System.exit(1);
    }
}
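The count-and-collect pattern above (run a COUNT(*) through SparkContextService and read the first row of the result) is easy to factor out. A small sketch of such a helper, using only the calls that appear in the snippet, could be:

/* Sketch of a reusable helper: count the rows of a Hive table through SparkContextService.
   Returns 0 when the query yields no rows. */
private long countRows(String databaseName, String tableName) {
    String sql = "SELECT COUNT(*) FROM " + databaseName + "." + tableName;
    log.info("Executing query {}", sql);
    DataSet dataFrame = scs.sql(getHiveContext(), sql);
    List<Row> resultRows = dataFrame.collectAsList();
    return resultRows.isEmpty() ? 0L : resultRows.get(0).getLong(0);
}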
Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.
The class DataQualityChecker, method doDataQualityChecks().
public void doDataQualityChecks() {
    try {
        SparkContext sparkContext = SparkContext.getOrCreate();
        hiveContext = new org.apache.spark.sql.hive.HiveContext(sparkContext);

        String sql = "SELECT COUNT(*) FROM " + categoryName + "." + feedName;
        log.info("Executing query {}", sql);
        DataSet dataFrame = scs.sql(getHiveContext(), sql);
        List<Row> resultRows = dataFrame.collectAsList();

        long rowCount = 0;
        if (resultRows.size() > 0) {
            rowCount = resultRows.get(0).getLong(0);
        }
        log.info("Total rows in {}.{}: {}", categoryName, feedName, rowCount);
    } catch (Exception e) {
        log.error("An error occurred during running data quality checks: {}", e.getMessage());
        System.exit(1);
    }
}
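As one illustration of how the counting pattern above could be extended into an actual check, a sketch that compares row counts between two tables and flags a mismatch follows; the choice of tables and the equality rule are illustrative assumptions, not the checks Kylo actually performs.

/* Illustrative sketch: compare row counts between two tables and log a warning on mismatch.
   The tables compared and the pass/fail rule are assumptions for illustration only. */
private boolean rowCountsMatch(String database, String sourceTable, String targetTable) {
    long sourceCount = countRows(database, sourceTable);   // helper sketched earlier
    long targetCount = countRows(database, targetTable);
    if (sourceCount != targetCount) {
        log.warn("Row count mismatch: {}.{} has {} rows, {}.{} has {}",
                 database, sourceTable, sourceCount, database, targetTable, targetCount);
        return false;
    }
    return true;
}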
Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.
The class TransformService, method execute().
/**
* Executes the specified transformation and returns the name of the Hive table containing the results.
*
* @param request the transformation request
* @return the Hive table containing the results
* @throws IllegalStateException if this service is not running
* @throws ScriptException if the script cannot be executed
*/
@Nonnull
public TransformResponse execute(@Nonnull final TransformRequest request) throws ScriptException {
    log.entry(request);

    // Handle async request
    if (request.isAsync()) {
        return cacheTransform(request);
    }

    // Execute script
    final DataSet dataSet = createShellTask(request);
    final StructType schema = dataSet.schema();
    TransformResponse response = submitTransformJob(new ShellTransformStage(dataSet), getPolicies(request));

    // Build response
    if (response.getStatus() != TransformResponse.Status.SUCCESS) {
        final String table = response.getTable();
        final TransformQueryResult partialResult = new TransformQueryResult();
        partialResult.setColumns(Arrays.<QueryResultColumn>asList(new QueryResultRowTransform(schema, table).columns()));

        response = new TransformResponse();
        response.setProgress(0.0);
        response.setResults(partialResult);
        response.setStatus(TransformResponse.Status.PENDING);
        response.setTable(table);
    }
    return log.exit(response);
}
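A synchronous caller of this method would build a TransformRequest carrying the script to run and then read the table name (or the PENDING partial result) out of the response. A minimal usage sketch follows; the setter names on TransformRequest are assumptions here, and transformService stands for an already-wired instance.

/* Usage sketch (setter names on TransformRequest are assumed for illustration;
   ScriptException handling is omitted). */
TransformRequest request = new TransformRequest();
request.setScript("df.groupBy(\"category\").count()");   // script to evaluate
request.setAsync(false);                                  // run synchronously

TransformResponse response = transformService.execute(request);
if (response.getStatus() == TransformResponse.Status.SUCCESS) {
    log.info("Results written to Hive table {}", response.getTable());
} else {
    // PENDING: only column metadata is available so far (see the partial result above).
    log.info("Transform still running; results table will be {}", response.getTable());
}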