use of org.apache.nifi.util.hive.CsvOutputOptions in project nifi by apache.
the class SelectHiveQL method onTrigger.
private void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
FlowFile fileToProcess = (context.hasIncomingConnection() ? session.get() : null);
FlowFile flowfile = null;
// we know that we should run only if we have a FlowFile.
if (context.hasIncomingConnection()) {
if (fileToProcess == null && context.hasNonLoopConnection()) {
return;
}
}
final ComponentLog logger = getLogger();
final HiveDBCPService dbcpService = context.getProperty(HIVE_DBCP_SERVICE).asControllerService(HiveDBCPService.class);
final Charset charset = Charset.forName(context.getProperty(CHARSET).getValue());
final boolean flowbased = !(context.getProperty(HIVEQL_SELECT_QUERY).isSet());
// Source the SQL
final String selectQuery;
if (context.getProperty(HIVEQL_SELECT_QUERY).isSet()) {
selectQuery = context.getProperty(HIVEQL_SELECT_QUERY).evaluateAttributeExpressions(fileToProcess).getValue();
} else {
// If the query is not set, then an incoming flow file is required, and expected to contain a valid SQL select query.
// If there is no incoming connection, onTrigger will not be called as the processor will fail when scheduled.
final StringBuilder queryContents = new StringBuilder();
session.read(fileToProcess, in -> queryContents.append(IOUtils.toString(in, charset)));
selectQuery = queryContents.toString();
}
final Integer fetchSize = context.getProperty(FETCH_SIZE).evaluateAttributeExpressions(fileToProcess).asInteger();
final Integer maxRowsPerFlowFile = context.getProperty(MAX_ROWS_PER_FLOW_FILE).evaluateAttributeExpressions(fileToProcess).asInteger();
final Integer maxFragments = context.getProperty(MAX_FRAGMENTS).isSet() ? context.getProperty(MAX_FRAGMENTS).evaluateAttributeExpressions(fileToProcess).asInteger() : 0;
final String outputFormat = context.getProperty(HIVEQL_OUTPUT_FORMAT).getValue();
final boolean convertNamesForAvro = context.getProperty(NORMALIZE_NAMES_FOR_AVRO).asBoolean();
final StopWatch stopWatch = new StopWatch(true);
final boolean header = context.getProperty(HIVEQL_CSV_HEADER).asBoolean();
final String altHeader = context.getProperty(HIVEQL_CSV_ALT_HEADER).evaluateAttributeExpressions(fileToProcess).getValue();
final String delimiter = context.getProperty(HIVEQL_CSV_DELIMITER).evaluateAttributeExpressions(fileToProcess).getValue();
final boolean quote = context.getProperty(HIVEQL_CSV_QUOTE).asBoolean();
final boolean escape = context.getProperty(HIVEQL_CSV_HEADER).asBoolean();
final String fragmentIdentifier = UUID.randomUUID().toString();
try (final Connection con = dbcpService.getConnection();
final Statement st = (flowbased ? con.prepareStatement(selectQuery) : con.createStatement())) {
if (fetchSize != null && fetchSize > 0) {
try {
st.setFetchSize(fetchSize);
} catch (SQLException se) {
// Not all drivers support this, just log the error (at debug level) and move on
logger.debug("Cannot set fetch size to {} due to {}", new Object[] { fetchSize, se.getLocalizedMessage() }, se);
}
}
final List<FlowFile> resultSetFlowFiles = new ArrayList<>();
try {
logger.debug("Executing query {}", new Object[] { selectQuery });
if (flowbased) {
// Hive JDBC Doesn't Support this yet:
// ParameterMetaData pmd = ((PreparedStatement)st).getParameterMetaData();
// int paramCount = pmd.getParameterCount();
// Alternate way to determine number of params in SQL.
int paramCount = StringUtils.countMatches(selectQuery, "?");
if (paramCount > 0) {
setParameters(1, (PreparedStatement) st, paramCount, fileToProcess.getAttributes());
}
}
final ResultSet resultSet;
try {
resultSet = (flowbased ? ((PreparedStatement) st).executeQuery() : st.executeQuery(selectQuery));
} catch (SQLException se) {
// If an error occurs during the query, a flowfile is expected to be routed to failure, so ensure one here
flowfile = (fileToProcess == null) ? session.create() : fileToProcess;
fileToProcess = null;
throw se;
}
int fragmentIndex = 0;
String baseFilename = (fileToProcess != null) ? fileToProcess.getAttribute(CoreAttributes.FILENAME.key()) : null;
while (true) {
final AtomicLong nrOfRows = new AtomicLong(0L);
flowfile = (flowfile == null) ? session.create() : session.create(flowfile);
if (baseFilename == null) {
baseFilename = flowfile.getAttribute(CoreAttributes.FILENAME.key());
}
try {
flowfile = session.write(flowfile, out -> {
try {
if (AVRO.equals(outputFormat)) {
nrOfRows.set(HiveJdbcCommon.convertToAvroStream(resultSet, out, maxRowsPerFlowFile, convertNamesForAvro));
} else if (CSV.equals(outputFormat)) {
CsvOutputOptions options = new CsvOutputOptions(header, altHeader, delimiter, quote, escape, maxRowsPerFlowFile);
nrOfRows.set(HiveJdbcCommon.convertToCsvStream(resultSet, out, options));
} else {
nrOfRows.set(0L);
throw new ProcessException("Unsupported output format: " + outputFormat);
}
} catch (final SQLException | RuntimeException e) {
throw new ProcessException("Error during database query or conversion of records.", e);
}
});
} catch (ProcessException e) {
// Add flowfile to results before rethrowing so it will be removed from session in outer catch
resultSetFlowFiles.add(flowfile);
throw e;
}
if (nrOfRows.get() > 0 || resultSetFlowFiles.isEmpty()) {
final Map<String, String> attributes = new HashMap<>();
// Set attribute for how many rows were selected
attributes.put(RESULT_ROW_COUNT, String.valueOf(nrOfRows.get()));
try {
// Set input/output table names by parsing the query
attributes.putAll(toQueryTableAttributes(findTableNames(selectQuery)));
} catch (Exception e) {
// If failed to parse the query, just log a warning message, but continue.
getLogger().warn("Failed to parse query: {} due to {}", new Object[] { selectQuery, e }, e);
}
// Set MIME type on output document and add extension to filename
if (AVRO.equals(outputFormat)) {
attributes.put(CoreAttributes.MIME_TYPE.key(), MIME_TYPE_AVRO_BINARY);
attributes.put(CoreAttributes.FILENAME.key(), baseFilename + "." + fragmentIndex + ".avro");
} else if (CSV.equals(outputFormat)) {
attributes.put(CoreAttributes.MIME_TYPE.key(), CSV_MIME_TYPE);
attributes.put(CoreAttributes.FILENAME.key(), baseFilename + "." + fragmentIndex + ".csv");
}
if (maxRowsPerFlowFile > 0) {
attributes.put("fragment.identifier", fragmentIdentifier);
attributes.put("fragment.index", String.valueOf(fragmentIndex));
}
flowfile = session.putAllAttributes(flowfile, attributes);
logger.info("{} contains {} Avro records; transferring to 'success'", new Object[] { flowfile, nrOfRows.get() });
if (context.hasIncomingConnection()) {
// If the flow file came from an incoming connection, issue a Fetch provenance event
session.getProvenanceReporter().fetch(flowfile, dbcpService.getConnectionURL(), "Retrieved " + nrOfRows.get() + " rows", stopWatch.getElapsed(TimeUnit.MILLISECONDS));
} else {
// If we created a flow file from rows received from Hive, issue a Receive provenance event
session.getProvenanceReporter().receive(flowfile, dbcpService.getConnectionURL(), stopWatch.getElapsed(TimeUnit.MILLISECONDS));
}
resultSetFlowFiles.add(flowfile);
} else {
// If there were no rows returned (and the first flow file has been sent, we're done processing, so remove the flowfile and carry on
session.remove(flowfile);
break;
}
fragmentIndex++;
if (maxFragments > 0 && fragmentIndex >= maxFragments) {
break;
}
}
for (int i = 0; i < resultSetFlowFiles.size(); i++) {
// Set count on all FlowFiles
if (maxRowsPerFlowFile > 0) {
resultSetFlowFiles.set(i, session.putAttribute(resultSetFlowFiles.get(i), "fragment.count", Integer.toString(fragmentIndex)));
}
}
} catch (final SQLException e) {
throw e;
}
session.transfer(resultSetFlowFiles, REL_SUCCESS);
} catch (final ProcessException | SQLException e) {
logger.error("Issue processing SQL {} due to {}.", new Object[] { selectQuery, e });
if (flowfile == null) {
// This can happen if any exceptions occur while setting up the connection, statement, etc.
logger.error("Unable to execute HiveQL select query {} due to {}. No FlowFile to route to failure", new Object[] { selectQuery, e });
context.yield();
} else {
if (context.hasIncomingConnection()) {
logger.error("Unable to execute HiveQL select query {} for {} due to {}; routing to failure", new Object[] { selectQuery, flowfile, e });
flowfile = session.penalize(flowfile);
} else {
logger.error("Unable to execute HiveQL select query {} due to {}; routing to failure", new Object[] { selectQuery, e });
context.yield();
}
session.transfer(flowfile, REL_FAILURE);
}
} finally {
if (fileToProcess != null) {
session.remove(fileToProcess);
}
}
}
Aggregations