Example 1 with AVRO

Use of org.apache.nifi.util.hive.HiveJdbcCommon.AVRO in project nifi by apache.

The onTrigger method of the SelectHiveQL class, which executes a HiveQL SELECT statement and writes the result set to one or more FlowFiles in Avro or CSV format:

private void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile fileToProcess = (context.hasIncomingConnection() ? session.get() : null);
    FlowFile flowfile = null;
    // If the processor has incoming connections but no FlowFile is queued (and at least one
    // connection comes from another processor rather than a self-loop), there is nothing to do.
    if (context.hasIncomingConnection()) {
        if (fileToProcess == null && context.hasNonLoopConnection()) {
            return;
        }
    }
    final ComponentLog logger = getLogger();
    final HiveDBCPService dbcpService = context.getProperty(HIVE_DBCP_SERVICE).asControllerService(HiveDBCPService.class);
    final Charset charset = Charset.forName(context.getProperty(CHARSET).getValue());
    final boolean flowbased = !(context.getProperty(HIVEQL_SELECT_QUERY).isSet());
    // Source the SQL
    final String selectQuery;
    if (context.getProperty(HIVEQL_SELECT_QUERY).isSet()) {
        selectQuery = context.getProperty(HIVEQL_SELECT_QUERY).evaluateAttributeExpressions(fileToProcess).getValue();
    } else {
        // If the query is not set, then an incoming flow file is required, and expected to contain a valid SQL select query.
        // If there is no incoming connection, onTrigger will not be called as the processor will fail when scheduled.
        final StringBuilder queryContents = new StringBuilder();
        session.read(fileToProcess, in -> queryContents.append(IOUtils.toString(in, charset)));
        selectQuery = queryContents.toString();
    }
    final Integer fetchSize = context.getProperty(FETCH_SIZE).evaluateAttributeExpressions(fileToProcess).asInteger();
    final Integer maxRowsPerFlowFile = context.getProperty(MAX_ROWS_PER_FLOW_FILE).evaluateAttributeExpressions(fileToProcess).asInteger();
    final Integer maxFragments = context.getProperty(MAX_FRAGMENTS).isSet() ? context.getProperty(MAX_FRAGMENTS).evaluateAttributeExpressions(fileToProcess).asInteger() : 0;
    final String outputFormat = context.getProperty(HIVEQL_OUTPUT_FORMAT).getValue();
    final boolean convertNamesForAvro = context.getProperty(NORMALIZE_NAMES_FOR_AVRO).asBoolean();
    final StopWatch stopWatch = new StopWatch(true);
    final boolean header = context.getProperty(HIVEQL_CSV_HEADER).asBoolean();
    final String altHeader = context.getProperty(HIVEQL_CSV_ALT_HEADER).evaluateAttributeExpressions(fileToProcess).getValue();
    final String delimiter = context.getProperty(HIVEQL_CSV_DELIMITER).evaluateAttributeExpressions(fileToProcess).getValue();
    final boolean quote = context.getProperty(HIVEQL_CSV_QUOTE).asBoolean();
    final boolean escape = context.getProperty(HIVEQL_CSV_ESCAPE).asBoolean();
    final String fragmentIdentifier = UUID.randomUUID().toString();
    try (final Connection con = dbcpService.getConnection();
        final Statement st = (flowbased ? con.prepareStatement(selectQuery) : con.createStatement())) {
        if (fetchSize != null && fetchSize > 0) {
            try {
                st.setFetchSize(fetchSize);
            } catch (SQLException se) {
                // Not all drivers support this, just log the error (at debug level) and move on
                logger.debug("Cannot set fetch size to {} due to {}", new Object[] { fetchSize, se.getLocalizedMessage() }, se);
            }
        }
        final List<FlowFile> resultSetFlowFiles = new ArrayList<>();
        try {
            logger.debug("Executing query {}", new Object[] { selectQuery });
            if (flowbased) {
                // Hive JDBC Doesn't Support this yet:
                // ParameterMetaData pmd = ((PreparedStatement)st).getParameterMetaData();
                // int paramCount = pmd.getParameterCount();
                // Alternate way to determine number of params in SQL.
                int paramCount = StringUtils.countMatches(selectQuery, "?");
                if (paramCount > 0) {
                    setParameters(1, (PreparedStatement) st, paramCount, fileToProcess.getAttributes());
                }
            }
            final ResultSet resultSet;
            try {
                resultSet = (flowbased ? ((PreparedStatement) st).executeQuery() : st.executeQuery(selectQuery));
            } catch (SQLException se) {
                // If an error occurs during the query, a flowfile is expected to be routed to failure, so ensure one here
                flowfile = (fileToProcess == null) ? session.create() : fileToProcess;
                fileToProcess = null;
                throw se;
            }
            int fragmentIndex = 0;
            String baseFilename = (fileToProcess != null) ? fileToProcess.getAttribute(CoreAttributes.FILENAME.key()) : null;
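            // Loop until the result set is exhausted (or maxFragments is reached), producing one
            // FlowFile per pass with up to maxRowsPerFlowFile rows (0 meaning no per-file limit).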
            while (true) {
                final AtomicLong nrOfRows = new AtomicLong(0L);
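                // The first FlowFile is created fresh; subsequent fragments are created as children
                // of the previous one so provenance lineage is preserved across fragments.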
                flowfile = (flowfile == null) ? session.create() : session.create(flowfile);
                if (baseFilename == null) {
                    baseFilename = flowfile.getAttribute(CoreAttributes.FILENAME.key());
                }
                try {
                    flowfile = session.write(flowfile, out -> {
                        try {
                            if (AVRO.equals(outputFormat)) {
                                nrOfRows.set(HiveJdbcCommon.convertToAvroStream(resultSet, out, maxRowsPerFlowFile, convertNamesForAvro));
                            } else if (CSV.equals(outputFormat)) {
                                CsvOutputOptions options = new CsvOutputOptions(header, altHeader, delimiter, quote, escape, maxRowsPerFlowFile);
                                nrOfRows.set(HiveJdbcCommon.convertToCsvStream(resultSet, out, options));
                            } else {
                                nrOfRows.set(0L);
                                throw new ProcessException("Unsupported output format: " + outputFormat);
                            }
                        } catch (final SQLException | RuntimeException e) {
                            throw new ProcessException("Error during database query or conversion of records.", e);
                        }
                    });
                } catch (ProcessException e) {
                    // Add flowfile to results before rethrowing so it will be removed from session in outer catch
                    resultSetFlowFiles.add(flowfile);
                    throw e;
                }
                if (nrOfRows.get() > 0 || resultSetFlowFiles.isEmpty()) {
                    final Map<String, String> attributes = new HashMap<>();
                    // Set attribute for how many rows were selected
                    attributes.put(RESULT_ROW_COUNT, String.valueOf(nrOfRows.get()));
                    try {
                        // Set input/output table names by parsing the query
                        attributes.putAll(toQueryTableAttributes(findTableNames(selectQuery)));
                    } catch (Exception e) {
                        // If failed to parse the query, just log a warning message, but continue.
                        getLogger().warn("Failed to parse query: {} due to {}", new Object[] { selectQuery, e }, e);
                    }
                    // Set MIME type on output document and add extension to filename
                    if (AVRO.equals(outputFormat)) {
                        attributes.put(CoreAttributes.MIME_TYPE.key(), MIME_TYPE_AVRO_BINARY);
                        attributes.put(CoreAttributes.FILENAME.key(), baseFilename + "." + fragmentIndex + ".avro");
                    } else if (CSV.equals(outputFormat)) {
                        attributes.put(CoreAttributes.MIME_TYPE.key(), CSV_MIME_TYPE);
                        attributes.put(CoreAttributes.FILENAME.key(), baseFilename + "." + fragmentIndex + ".csv");
                    }
                    if (maxRowsPerFlowFile > 0) {
                        attributes.put("fragment.identifier", fragmentIdentifier);
                        attributes.put("fragment.index", String.valueOf(fragmentIndex));
                    }
                    flowfile = session.putAllAttributes(flowfile, attributes);
                    logger.info("{} contains {} Avro records; transferring to 'success'", new Object[] { flowfile, nrOfRows.get() });
                    if (context.hasIncomingConnection()) {
                        // If the flow file came from an incoming connection, issue a Fetch provenance event
                        session.getProvenanceReporter().fetch(flowfile, dbcpService.getConnectionURL(), "Retrieved " + nrOfRows.get() + " rows", stopWatch.getElapsed(TimeUnit.MILLISECONDS));
                    } else {
                        // If we created a flow file from rows received from Hive, issue a Receive provenance event
                        session.getProvenanceReporter().receive(flowfile, dbcpService.getConnectionURL(), stopWatch.getElapsed(TimeUnit.MILLISECONDS));
                    }
                    resultSetFlowFiles.add(flowfile);
                } else {
                    // If no rows were returned (and the first flow file has already been sent), we're done processing, so remove the flowfile and carry on
                    session.remove(flowfile);
                    break;
                }
                fragmentIndex++;
                if (maxFragments > 0 && fragmentIndex >= maxFragments) {
                    break;
                }
            }
            for (int i = 0; i < resultSetFlowFiles.size(); i++) {
                // Set count on all FlowFiles
                if (maxRowsPerFlowFile > 0) {
                    resultSetFlowFiles.set(i, session.putAttribute(resultSetFlowFiles.get(i), "fragment.count", Integer.toString(fragmentIndex)));
                }
            }
        } catch (final SQLException e) {
            throw e;
        }
        session.transfer(resultSetFlowFiles, REL_SUCCESS);
    } catch (final ProcessException | SQLException e) {
        logger.error("Issue processing SQL {} due to {}.", new Object[] { selectQuery, e });
        if (flowfile == null) {
            // This can happen if any exceptions occur while setting up the connection, statement, etc.
            logger.error("Unable to execute HiveQL select query {} due to {}. No FlowFile to route to failure", new Object[] { selectQuery, e });
            context.yield();
        } else {
            if (context.hasIncomingConnection()) {
                logger.error("Unable to execute HiveQL select query {} for {} due to {}; routing to failure", new Object[] { selectQuery, flowfile, e });
                flowfile = session.penalize(flowfile);
            } else {
                logger.error("Unable to execute HiveQL select query {} due to {}; routing to failure", new Object[] { selectQuery, e });
                context.yield();
            }
            session.transfer(flowfile, REL_FAILURE);
        }
    } finally {
        if (fileToProcess != null) {
            session.remove(fileToProcess);
        }
    }
}
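
As a rough companion to the method above, here is a minimal sketch of exercising SelectHiveQL in the "flow-based" branch (HIVEQL_SELECT_QUERY left unset) using NiFi's mock test harness. The hiveConnectionPool argument, the "hive-dbcp" identifier, and the sample query are illustrative assumptions, not taken from the project; the hiveql.args.N.type / hiveql.args.N.value attribute names follow the convention the processor's setParameters helper expects.

import java.nio.charset.StandardCharsets;
import java.sql.Types;
import java.util.HashMap;
import java.util.Map;

import org.apache.nifi.dbcp.hive.HiveDBCPService;
import org.apache.nifi.processors.hive.SelectHiveQL;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.apache.nifi.util.hive.HiveJdbcCommon;

public class SelectHiveQLFlowBasedSketch {

    // hiveConnectionPool stands in for whatever HiveDBCPService implementation the test supplies.
    void runFlowBased(HiveDBCPService hiveConnectionPool) throws Exception {
        TestRunner runner = TestRunners.newTestRunner(SelectHiveQL.class);
        runner.addControllerService("hive-dbcp", hiveConnectionPool);
        runner.enableControllerService(hiveConnectionPool);
        runner.setProperty(SelectHiveQL.HIVE_DBCP_SERVICE, "hive-dbcp");
        runner.setProperty(SelectHiveQL.HIVEQL_OUTPUT_FORMAT, HiveJdbcCommon.AVRO);

        // HIVEQL_SELECT_QUERY is deliberately left unset, so onTrigger reads the query from the
        // FlowFile content and binds each "?" from hiveql.args.N.type / hiveql.args.N.value attributes.
        Map<String, String> attrs = new HashMap<>();
        attrs.put("hiveql.args.1.type", String.valueOf(Types.VARCHAR));
        attrs.put("hiveql.args.1.value", "Armstrong");
        runner.enqueue("SELECT * FROM persons WHERE last_name = ?".getBytes(StandardCharsets.UTF_8), attrs);

        runner.run();
        runner.assertAllFlowFilesTransferred(SelectHiveQL.REL_SUCCESS, 1);
    }
}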
Also used : StandardValidators(org.apache.nifi.processor.util.StandardValidators) StringUtils(org.apache.commons.lang.StringUtils) Connection(java.sql.Connection) CapabilityDescription(org.apache.nifi.annotation.documentation.CapabilityDescription) NORMALIZE_NAMES_FOR_AVRO(org.apache.nifi.util.hive.HiveJdbcCommon.NORMALIZE_NAMES_FOR_AVRO) HashMap(java.util.HashMap) EventDriven(org.apache.nifi.annotation.behavior.EventDriven) CSV_MIME_TYPE(org.apache.nifi.util.hive.HiveJdbcCommon.CSV_MIME_TYPE) ComponentLog(org.apache.nifi.logging.ComponentLog) PropertyDescriptor(org.apache.nifi.components.PropertyDescriptor) ProcessException(org.apache.nifi.processor.exception.ProcessException) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) SQLException(java.sql.SQLException) Charset(java.nio.charset.Charset) WritesAttributes(org.apache.nifi.annotation.behavior.WritesAttributes) Relationship(org.apache.nifi.processor.Relationship) CSV(org.apache.nifi.util.hive.HiveJdbcCommon.CSV) ResultSet(java.sql.ResultSet) Map(java.util.Map) HiveDBCPService(org.apache.nifi.dbcp.hive.HiveDBCPService) Requirement(org.apache.nifi.annotation.behavior.InputRequirement.Requirement) MIME_TYPE_AVRO_BINARY(org.apache.nifi.util.hive.HiveJdbcCommon.MIME_TYPE_AVRO_BINARY) PartialFunctions(org.apache.nifi.processor.util.pattern.PartialFunctions) HiveJdbcCommon(org.apache.nifi.util.hive.HiveJdbcCommon) FlowFile(org.apache.nifi.flowfile.FlowFile) ProcessContext(org.apache.nifi.processor.ProcessContext) Set(java.util.Set) ProcessSession(org.apache.nifi.processor.ProcessSession) AVRO(org.apache.nifi.util.hive.HiveJdbcCommon.AVRO) UUID(java.util.UUID) WritesAttribute(org.apache.nifi.annotation.behavior.WritesAttribute) PreparedStatement(java.sql.PreparedStatement) ProcessSessionFactory(org.apache.nifi.processor.ProcessSessionFactory) TimeUnit(java.util.concurrent.TimeUnit) AtomicLong(java.util.concurrent.atomic.AtomicLong) IOUtils(org.apache.commons.io.IOUtils) List(java.util.List) InputRequirement(org.apache.nifi.annotation.behavior.InputRequirement) OnScheduled(org.apache.nifi.annotation.lifecycle.OnScheduled) CsvOutputOptions(org.apache.nifi.util.hive.CsvOutputOptions) Statement(java.sql.Statement) StopWatch(org.apache.nifi.util.StopWatch) Tags(org.apache.nifi.annotation.documentation.Tags) CoreAttributes(org.apache.nifi.flowfile.attributes.CoreAttributes) Collections(java.util.Collections)
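
One follow-up note, not from the source above: when the Avro output format is used, convertToAvroStream writes a standard Avro data file with the schema embedded, so a downstream consumer can deserialize the FlowFile content with Avro's generic reader. A minimal sketch, with the class and method names as placeholders:

import java.io.IOException;
import java.io.InputStream;

import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class AvroResultReaderSketch {

    // "in" would be the content of a FlowFile produced above with the Avro output format.
    public static long printRecords(InputStream in) throws IOException {
        long count = 0;
        try (DataFileStream<GenericRecord> reader =
                 new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
            while (reader.hasNext()) {
                GenericRecord record = reader.next();   // one row of the Hive result set
                System.out.println(record);
                count++;
            }
        }
        return count;
    }
}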
