Search in sources :
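
All of the examples below exercise NiFi's HiveDBCPService controller-service interface. For orientation, a minimal sketch of the interface as these snippets use it (signatures only; getConnection() is inherited from DBCPService):

package org.apache.nifi.dbcp.hive;

import org.apache.nifi.dbcp.DBCPService;

// Minimal sketch of the interface exercised in the examples below. DBCPService
// contributes Connection getConnection(); HiveDBCPService adds the JDBC URL,
// which SelectHiveQL uses when emitting provenance events.
public interface HiveDBCPService extends DBCPService {
    String getConnectionURL();
}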

Example 1 with HiveDBCPService

Use of org.apache.nifi.dbcp.hive.HiveDBCPService in project nifi by apache.

The class TestSelectHiveQL, method testWithSqlException.

@Test
public void testWithSqlException() throws SQLException {
    // remove previous test database, if any
    final File dbLocation = new File(DB_LOCATION);
    dbLocation.delete();
    // load test data to database
    final Connection con = ((HiveDBCPService) runner.getControllerService("dbcp")).getConnection();
    Statement stmt = con.createStatement();
    try {
        stmt.execute("drop table TEST_NO_ROWS");
    } catch (final SQLException sqle) {
    // Nothing to do, probably means the table didn't exist
    }
    stmt.execute("create table TEST_NO_ROWS (id integer)");
    runner.setIncomingConnection(false);
    // Try a valid SQL statement that will generate an error (val1 does not exist, e.g.)
    runner.setProperty(SelectHiveQL.HIVEQL_SELECT_QUERY, "SELECT val1 FROM TEST_NO_ROWS");
    runner.run();
    runner.assertAllFlowFilesTransferred(SelectHiveQL.REL_FAILURE, 1);
}
Also used : HiveDBCPService(org.apache.nifi.dbcp.hive.HiveDBCPService) SQLException(java.sql.SQLException) Statement(java.sql.Statement) Connection(java.sql.Connection) File(java.io.File) MockFlowFile(org.apache.nifi.util.MockFlowFile) Test(org.junit.Test)
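
The test above assumes a controller service registered with the TestRunner under the identifier "dbcp". The NiFi test class wires up a simple Derby-backed service; the sketch below shows one way such a service could look and be registered (the class name and wiring are illustrative assumptions, not the exact test fixture):

import java.sql.Connection;
import java.sql.DriverManager;
import org.apache.nifi.controller.AbstractControllerService;
import org.apache.nifi.dbcp.hive.HiveDBCPService;
import org.apache.nifi.processor.exception.ProcessException;

// Hypothetical Derby-backed HiveDBCPService used only for tests.
class SimpleHiveDBCPService extends AbstractControllerService implements HiveDBCPService {

    private final String dbLocation;

    SimpleHiveDBCPService(final String dbLocation) {
        this.dbLocation = dbLocation;
    }

    @Override
    public String getConnectionURL() {
        // Embedded Derby URL; creates the database on first use
        return "jdbc:derby:" + dbLocation + ";create=true";
    }

    @Override
    public Connection getConnection() throws ProcessException {
        try {
            Class.forName("org.apache.derby.jdbc.EmbeddedDriver");
            return DriverManager.getConnection(getConnectionURL());
        } catch (final Exception e) {
            throw new ProcessException("Unable to open Derby connection", e);
        }
    }
}

// Registration, typically done in a @Before method:
// final SimpleHiveDBCPService dbcp = new SimpleHiveDBCPService(DB_LOCATION);
// runner.addControllerService("dbcp", dbcp);
// runner.enableControllerService(dbcp);
// runner.setProperty(SelectHiveQL.HIVE_DBCP_SERVICE, "dbcp");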

Example 2 with HiveDBCPService

Use of org.apache.nifi.dbcp.hive.HiveDBCPService in project nifi by apache.

The class TestSelectHiveQL, method invokeOnTrigger.

public void invokeOnTrigger(final String query, final boolean incomingFlowFile, String outputFormat) throws InitializationException, ClassNotFoundException, SQLException, IOException {
    // remove previous test database, if any
    final File dbLocation = new File(DB_LOCATION);
    dbLocation.delete();
    // load test data to database
    final Connection con = ((HiveDBCPService) runner.getControllerService("dbcp")).getConnection();
    final Statement stmt = con.createStatement();
    try {
        stmt.execute("drop table persons");
    } catch (final SQLException sqle) {
    // Nothing to do here, the table didn't exist
    }
    stmt.execute("create table persons (id integer, name varchar(100), code integer)");
    Random rng = new Random(53496);
    final int nrOfRows = 100;
    stmt.executeUpdate("insert into persons values (1, 'Joe Smith', " + rng.nextInt(469947) + ")");
    for (int i = 2; i < nrOfRows; i++) {
        stmt.executeUpdate("insert into persons values (" + i + ", 'Someone Else', " + rng.nextInt(469947) + ")");
    }
    stmt.executeUpdate("insert into persons values (" + nrOfRows + ", 'Last Person', NULL)");
    LOGGER.info("test data loaded");
    runner.setProperty(SelectHiveQL.HIVEQL_SELECT_QUERY, query);
    runner.setProperty(HIVEQL_OUTPUT_FORMAT, outputFormat);
    if (incomingFlowFile) {
        // incoming FlowFile content is not used, but attributes are used
        final Map<String, String> attributes = new HashMap<>();
        attributes.put("person.id", "10");
        runner.enqueue("Hello".getBytes(), attributes);
    }
    runner.setIncomingConnection(incomingFlowFile);
    runner.run();
    runner.assertAllFlowFilesTransferred(SelectHiveQL.REL_SUCCESS, 1);
    final List<MockFlowFile> flowfiles = runner.getFlowFilesForRelationship(SelectHiveQL.REL_SUCCESS);
    MockFlowFile flowFile = flowfiles.get(0);
    final InputStream in = new ByteArrayInputStream(flowFile.toByteArray());
    long recordsFromStream = 0;
    if (AVRO.equals(outputFormat)) {
        assertEquals(MIME_TYPE_AVRO_BINARY, flowFile.getAttribute(CoreAttributes.MIME_TYPE.key()));
        final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
        try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
            GenericRecord record = null;
            while (dataFileReader.hasNext()) {
                // Reuse record object by passing it to next(). This saves us from
                // allocating and garbage collecting many objects for files with
                // many items.
                record = dataFileReader.next(record);
                recordsFromStream++;
            }
        }
    } else {
        assertEquals(CSV_MIME_TYPE, flowFile.getAttribute(CoreAttributes.MIME_TYPE.key()));
        BufferedReader br = new BufferedReader(new InputStreamReader(in));
        String headerRow = br.readLine();
        // Derby capitalizes column names
        assertEquals("PERSONID,PERSONNAME,PERSONCODE", headerRow);
        // Validate rows
        String line;
        while ((line = br.readLine()) != null) {
            recordsFromStream++;
            String[] values = line.split(",");
            if (recordsFromStream < (nrOfRows - 10)) {
                assertEquals(3, values.length);
                assertTrue(values[1].startsWith("\""));
                assertTrue(values[1].endsWith("\""));
            } else {
                // Middle value is null
                assertEquals(2, values.length);
            }
        }
    }
    assertEquals(nrOfRows - 10, recordsFromStream);
    assertEquals(recordsFromStream, Integer.parseInt(flowFile.getAttribute(SelectHiveQL.RESULT_ROW_COUNT)));
    flowFile.assertAttributeEquals(AbstractHiveQLProcessor.ATTR_INPUT_TABLES, "persons");
}
Also used : HiveDBCPService(org.apache.nifi.dbcp.hive.HiveDBCPService) InputStreamReader(java.io.InputStreamReader) SQLException(java.sql.SQLException) HashMap(java.util.HashMap) Statement(java.sql.Statement) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) Connection(java.sql.Connection) DataFileStream(org.apache.avro.file.DataFileStream) MockFlowFile(org.apache.nifi.util.MockFlowFile) Random(java.util.Random) BufferedReader(java.io.BufferedReader) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File)
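
invokeOnTrigger is a shared helper; the class's @Test methods call it with a query, a flag for whether an incoming FlowFile is enqueued, and an output format. A hedged illustration of that call pattern (the method names and query constants below are assumptions; the real constants alias the columns to PERSONID/PERSONNAME/PERSONCODE and filter so that nrOfRows - 10 records come back):

@Test
public void testAvroOutput() throws Exception {
    // QUERY_WITHOUT_EL is assumed to look roughly like:
    // "select PER.ID as PersonId, PER.NAME as PersonName, PER.CODE as PersonCode from persons PER where PER.ID > 10"
    invokeOnTrigger(QUERY_WITHOUT_EL, false, AVRO);
}

@Test
public void testCsvOutputWithIncomingFlowFile() throws Exception {
    // The Expression Language variant resolves ${person.id} from the enqueued FlowFile's attributes.
    invokeOnTrigger(QUERY_WITH_EL, true, CSV);
}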

Example 3 with HiveDBCPService

Use of org.apache.nifi.dbcp.hive.HiveDBCPService in project nifi by apache.

The class SelectHiveQL, method onTrigger.

private void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile fileToProcess = (context.hasIncomingConnection() ? session.get() : null);
    FlowFile flowfile = null;
    // If there is an incoming connection but no FlowFile is available (and not every incoming connection is a self-loop), there is nothing to do on this trigger.
    if (context.hasIncomingConnection()) {
        if (fileToProcess == null && context.hasNonLoopConnection()) {
            return;
        }
    }
    final ComponentLog logger = getLogger();
    final HiveDBCPService dbcpService = context.getProperty(HIVE_DBCP_SERVICE).asControllerService(HiveDBCPService.class);
    final Charset charset = Charset.forName(context.getProperty(CHARSET).getValue());
    final boolean flowbased = !(context.getProperty(HIVEQL_SELECT_QUERY).isSet());
    // Source the SQL
    final String selectQuery;
    if (context.getProperty(HIVEQL_SELECT_QUERY).isSet()) {
        selectQuery = context.getProperty(HIVEQL_SELECT_QUERY).evaluateAttributeExpressions(fileToProcess).getValue();
    } else {
        // If the query is not set, then an incoming flow file is required, and expected to contain a valid SQL select query.
        // If there is no incoming connection, onTrigger will not be called as the processor will fail when scheduled.
        final StringBuilder queryContents = new StringBuilder();
        session.read(fileToProcess, in -> queryContents.append(IOUtils.toString(in, charset)));
        selectQuery = queryContents.toString();
    }
    final Integer fetchSize = context.getProperty(FETCH_SIZE).evaluateAttributeExpressions(fileToProcess).asInteger();
    final Integer maxRowsPerFlowFile = context.getProperty(MAX_ROWS_PER_FLOW_FILE).evaluateAttributeExpressions(fileToProcess).asInteger();
    final Integer maxFragments = context.getProperty(MAX_FRAGMENTS).isSet() ? context.getProperty(MAX_FRAGMENTS).evaluateAttributeExpressions(fileToProcess).asInteger() : 0;
    final String outputFormat = context.getProperty(HIVEQL_OUTPUT_FORMAT).getValue();
    final boolean convertNamesForAvro = context.getProperty(NORMALIZE_NAMES_FOR_AVRO).asBoolean();
    final StopWatch stopWatch = new StopWatch(true);
    final boolean header = context.getProperty(HIVEQL_CSV_HEADER).asBoolean();
    final String altHeader = context.getProperty(HIVEQL_CSV_ALT_HEADER).evaluateAttributeExpressions(fileToProcess).getValue();
    final String delimiter = context.getProperty(HIVEQL_CSV_DELIMITER).evaluateAttributeExpressions(fileToProcess).getValue();
    final boolean quote = context.getProperty(HIVEQL_CSV_QUOTE).asBoolean();
    final boolean escape = context.getProperty(HIVEQL_CSV_ESCAPE).asBoolean();
    final String fragmentIdentifier = UUID.randomUUID().toString();
    try (final Connection con = dbcpService.getConnection();
        final Statement st = (flowbased ? con.prepareStatement(selectQuery) : con.createStatement())) {
        if (fetchSize != null && fetchSize > 0) {
            try {
                st.setFetchSize(fetchSize);
            } catch (SQLException se) {
                // Not all drivers support this, just log the error (at debug level) and move on
                logger.debug("Cannot set fetch size to {} due to {}", new Object[] { fetchSize, se.getLocalizedMessage() }, se);
            }
        }
        final List<FlowFile> resultSetFlowFiles = new ArrayList<>();
        try {
            logger.debug("Executing query {}", new Object[] { selectQuery });
            if (flowbased) {
                // Hive JDBC Doesn't Support this yet:
                // ParameterMetaData pmd = ((PreparedStatement)st).getParameterMetaData();
                // int paramCount = pmd.getParameterCount();
                // Alternate way to determine number of params in SQL.
                int paramCount = StringUtils.countMatches(selectQuery, "?");
                if (paramCount > 0) {
                    setParameters(1, (PreparedStatement) st, paramCount, fileToProcess.getAttributes());
                }
            }
            final ResultSet resultSet;
            try {
                resultSet = (flowbased ? ((PreparedStatement) st).executeQuery() : st.executeQuery(selectQuery));
            } catch (SQLException se) {
                // If an error occurs during the query, a flowfile is expected to be routed to failure, so ensure one here
                flowfile = (fileToProcess == null) ? session.create() : fileToProcess;
                fileToProcess = null;
                throw se;
            }
            int fragmentIndex = 0;
            String baseFilename = (fileToProcess != null) ? fileToProcess.getAttribute(CoreAttributes.FILENAME.key()) : null;
            while (true) {
                final AtomicLong nrOfRows = new AtomicLong(0L);
                flowfile = (flowfile == null) ? session.create() : session.create(flowfile);
                if (baseFilename == null) {
                    baseFilename = flowfile.getAttribute(CoreAttributes.FILENAME.key());
                }
                try {
                    flowfile = session.write(flowfile, out -> {
                        try {
                            if (AVRO.equals(outputFormat)) {
                                nrOfRows.set(HiveJdbcCommon.convertToAvroStream(resultSet, out, maxRowsPerFlowFile, convertNamesForAvro));
                            } else if (CSV.equals(outputFormat)) {
                                CsvOutputOptions options = new CsvOutputOptions(header, altHeader, delimiter, quote, escape, maxRowsPerFlowFile);
                                nrOfRows.set(HiveJdbcCommon.convertToCsvStream(resultSet, out, options));
                            } else {
                                nrOfRows.set(0L);
                                throw new ProcessException("Unsupported output format: " + outputFormat);
                            }
                        } catch (final SQLException | RuntimeException e) {
                            throw new ProcessException("Error during database query or conversion of records.", e);
                        }
                    });
                } catch (ProcessException e) {
                    // Add flowfile to results before rethrowing so it will be removed from session in outer catch
                    resultSetFlowFiles.add(flowfile);
                    throw e;
                }
                if (nrOfRows.get() > 0 || resultSetFlowFiles.isEmpty()) {
                    final Map<String, String> attributes = new HashMap<>();
                    // Set attribute for how many rows were selected
                    attributes.put(RESULT_ROW_COUNT, String.valueOf(nrOfRows.get()));
                    try {
                        // Set input/output table names by parsing the query
                        attributes.putAll(toQueryTableAttributes(findTableNames(selectQuery)));
                    } catch (Exception e) {
                        // If failed to parse the query, just log a warning message, but continue.
                        getLogger().warn("Failed to parse query: {} due to {}", new Object[] { selectQuery, e }, e);
                    }
                    // Set MIME type on output document and add extension to filename
                    if (AVRO.equals(outputFormat)) {
                        attributes.put(CoreAttributes.MIME_TYPE.key(), MIME_TYPE_AVRO_BINARY);
                        attributes.put(CoreAttributes.FILENAME.key(), baseFilename + "." + fragmentIndex + ".avro");
                    } else if (CSV.equals(outputFormat)) {
                        attributes.put(CoreAttributes.MIME_TYPE.key(), CSV_MIME_TYPE);
                        attributes.put(CoreAttributes.FILENAME.key(), baseFilename + "." + fragmentIndex + ".csv");
                    }
                    if (maxRowsPerFlowFile > 0) {
                        attributes.put("fragment.identifier", fragmentIdentifier);
                        attributes.put("fragment.index", String.valueOf(fragmentIndex));
                    }
                    flowfile = session.putAllAttributes(flowfile, attributes);
                    logger.info("{} contains {} Avro records; transferring to 'success'", new Object[] { flowfile, nrOfRows.get() });
                    if (context.hasIncomingConnection()) {
                        // If the flow file came from an incoming connection, issue a Fetch provenance event
                        session.getProvenanceReporter().fetch(flowfile, dbcpService.getConnectionURL(), "Retrieved " + nrOfRows.get() + " rows", stopWatch.getElapsed(TimeUnit.MILLISECONDS));
                    } else {
                        // If we created a flow file from rows received from Hive, issue a Receive provenance event
                        session.getProvenanceReporter().receive(flowfile, dbcpService.getConnectionURL(), stopWatch.getElapsed(TimeUnit.MILLISECONDS));
                    }
                    resultSetFlowFiles.add(flowfile);
                } else {
                    // If there were no rows returned (and the first FlowFile has already been sent), we're done processing, so remove the flowfile and carry on
                    session.remove(flowfile);
                    break;
                }
                fragmentIndex++;
                if (maxFragments > 0 && fragmentIndex >= maxFragments) {
                    break;
                }
            }
            for (int i = 0; i < resultSetFlowFiles.size(); i++) {
                // Set count on all FlowFiles
                if (maxRowsPerFlowFile > 0) {
                    resultSetFlowFiles.set(i, session.putAttribute(resultSetFlowFiles.get(i), "fragment.count", Integer.toString(fragmentIndex)));
                }
            }
        } catch (final SQLException e) {
            throw e;
        }
        session.transfer(resultSetFlowFiles, REL_SUCCESS);
    } catch (final ProcessException | SQLException e) {
        logger.error("Issue processing SQL {} due to {}.", new Object[] { selectQuery, e });
        if (flowfile == null) {
            // This can happen if any exceptions occur while setting up the connection, statement, etc.
            logger.error("Unable to execute HiveQL select query {} due to {}. No FlowFile to route to failure", new Object[] { selectQuery, e });
            context.yield();
        } else {
            if (context.hasIncomingConnection()) {
                logger.error("Unable to execute HiveQL select query {} for {} due to {}; routing to failure", new Object[] { selectQuery, flowfile, e });
                flowfile = session.penalize(flowfile);
            } else {
                logger.error("Unable to execute HiveQL select query {} due to {}; routing to failure", new Object[] { selectQuery, e });
                context.yield();
            }
            session.transfer(flowfile, REL_FAILURE);
        }
    } finally {
        if (fileToProcess != null) {
            session.remove(fileToProcess);
        }
    }
}
Also used : StandardValidators(org.apache.nifi.processor.util.StandardValidators) StringUtils(org.apache.commons.lang.StringUtils) Connection(java.sql.Connection) CapabilityDescription(org.apache.nifi.annotation.documentation.CapabilityDescription) NORMALIZE_NAMES_FOR_AVRO(org.apache.nifi.util.hive.HiveJdbcCommon.NORMALIZE_NAMES_FOR_AVRO) HashMap(java.util.HashMap) EventDriven(org.apache.nifi.annotation.behavior.EventDriven) CSV_MIME_TYPE(org.apache.nifi.util.hive.HiveJdbcCommon.CSV_MIME_TYPE) ComponentLog(org.apache.nifi.logging.ComponentLog) PropertyDescriptor(org.apache.nifi.components.PropertyDescriptor) ProcessException(org.apache.nifi.processor.exception.ProcessException) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) SQLException(java.sql.SQLException) Charset(java.nio.charset.Charset) WritesAttributes(org.apache.nifi.annotation.behavior.WritesAttributes) Relationship(org.apache.nifi.processor.Relationship) CSV(org.apache.nifi.util.hive.HiveJdbcCommon.CSV) ResultSet(java.sql.ResultSet) Map(java.util.Map) HiveDBCPService(org.apache.nifi.dbcp.hive.HiveDBCPService) Requirement(org.apache.nifi.annotation.behavior.InputRequirement.Requirement) MIME_TYPE_AVRO_BINARY(org.apache.nifi.util.hive.HiveJdbcCommon.MIME_TYPE_AVRO_BINARY) PartialFunctions(org.apache.nifi.processor.util.pattern.PartialFunctions) HiveJdbcCommon(org.apache.nifi.util.hive.HiveJdbcCommon) FlowFile(org.apache.nifi.flowfile.FlowFile) ProcessContext(org.apache.nifi.processor.ProcessContext) Set(java.util.Set) ProcessSession(org.apache.nifi.processor.ProcessSession) AVRO(org.apache.nifi.util.hive.HiveJdbcCommon.AVRO) UUID(java.util.UUID) WritesAttribute(org.apache.nifi.annotation.behavior.WritesAttribute) PreparedStatement(java.sql.PreparedStatement) ProcessSessionFactory(org.apache.nifi.processor.ProcessSessionFactory) TimeUnit(java.util.concurrent.TimeUnit) AtomicLong(java.util.concurrent.atomic.AtomicLong) IOUtils(org.apache.commons.io.IOUtils) List(java.util.List) InputRequirement(org.apache.nifi.annotation.behavior.InputRequirement) OnScheduled(org.apache.nifi.annotation.lifecycle.OnScheduled) CsvOutputOptions(org.apache.nifi.util.hive.CsvOutputOptions) Statement(java.sql.Statement) StopWatch(org.apache.nifi.util.StopWatch) Tags(org.apache.nifi.annotation.documentation.Tags) CoreAttributes(org.apache.nifi.flowfile.attributes.CoreAttributes) Collections(java.util.Collections)
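
One detail worth calling out from onTrigger: when HIVEQL_SELECT_QUERY is not set (flowbased is true), the incoming FlowFile's content is read as the query, a PreparedStatement is used, and any ? placeholders are bound from FlowFile attributes via setParameters. A hedged sketch of driving that path from a TestRunner, assuming the hiveql.args.N.type/value attribute convention used by the NiFi Hive processors (the attribute names and values here are assumptions, not taken from the excerpt):

// Leave HIVEQL_SELECT_QUERY unset so the FlowFile content is treated as the query.
final Map<String, String> attributes = new HashMap<>();
// Parameter types use java.sql.Types codes; 4 == INTEGER.
attributes.put("hiveql.args.1.type", String.valueOf(java.sql.Types.INTEGER));
attributes.put("hiveql.args.1.value", "10");
runner.setIncomingConnection(true);
runner.enqueue("select id, name from persons where id > ?".getBytes(), attributes);
runner.run();
runner.assertAllFlowFilesTransferred(SelectHiveQL.REL_SUCCESS, 1);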

Example 4 with HiveDBCPService

Use of org.apache.nifi.dbcp.hive.HiveDBCPService in project nifi by apache.

The class TestSelectHiveQL, method testWithNullIntColumn.

@Test
public void testWithNullIntColumn() throws SQLException {
    // remove previous test database, if any
    final File dbLocation = new File(DB_LOCATION);
    dbLocation.delete();
    // load test data to database
    final Connection con = ((HiveDBCPService) runner.getControllerService("dbcp")).getConnection();
    Statement stmt = con.createStatement();
    try {
        stmt.execute("drop table TEST_NULL_INT");
    } catch (final SQLException sqle) {
    // Nothing to do, probably means the table didn't exist
    }
    stmt.execute("create table TEST_NULL_INT (id integer not null, val1 integer, val2 integer, constraint my_pk primary key (id))");
    stmt.execute("insert into TEST_NULL_INT (id, val1, val2) VALUES (0, NULL, 1)");
    stmt.execute("insert into TEST_NULL_INT (id, val1, val2) VALUES (1, 1, 1)");
    runner.setIncomingConnection(false);
    runner.setProperty(SelectHiveQL.HIVEQL_SELECT_QUERY, "SELECT * FROM TEST_NULL_INT");
    runner.run();
    runner.assertAllFlowFilesTransferred(SelectHiveQL.REL_SUCCESS, 1);
    runner.getFlowFilesForRelationship(SelectHiveQL.REL_SUCCESS).get(0).assertAttributeEquals(SelectHiveQL.RESULT_ROW_COUNT, "2");
}
Also used : HiveDBCPService(org.apache.nifi.dbcp.hive.HiveDBCPService) SQLException(java.sql.SQLException) Statement(java.sql.Statement) Connection(java.sql.Connection) File(java.io.File) MockFlowFile(org.apache.nifi.util.MockFlowFile) Test(org.junit.Test)

Example 5 with HiveDBCPService

Use of org.apache.nifi.dbcp.hive.HiveDBCPService in project kylo by Teradata.

The class MergeTable, method getConnection.

@Override
public Connection getConnection(ProcessContext context) {
    ThriftService thriftService = context.getProperty(THRIFT_SERVICE).asControllerService(ThriftService.class);
    HiveDBCPService hiveConnectionPool = context.getProperty(HIVE_CONNECTION_POOL).asControllerService(HiveDBCPService.class);
    if (hiveConnectionPool != null) {
        getLogger().info("Returning Connection from HiveConnectionPool");
        return hiveConnectionPool.getConnection();
    } else {
        getLogger().info("Returning Connection from ThriftConnectionPool");
        return thriftService.getConnection();
    }
}
Also used : HiveDBCPService(org.apache.nifi.dbcp.hive.HiveDBCPService) ThriftService(com.thinkbiganalytics.nifi.v2.thrift.ThriftService)
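
The override above prefers an optional Hive connection pool and falls back to the Thrift service. For completeness, a hedged sketch of how such a HIVE_CONNECTION_POOL property is typically declared so the framework can resolve it to a HiveDBCPService (the name and description strings are illustrative, not copied from MergeTable):

import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.dbcp.hive.HiveDBCPService;

// Illustrative declaration: an optional controller-service property that NiFi
// resolves to a HiveDBCPService implementation at runtime.
public static final PropertyDescriptor HIVE_CONNECTION_POOL = new PropertyDescriptor.Builder()
        .name("Hive Database Connection Pooling Service")
        .description("Optional Hive connection pool; when unset, the Thrift service is used instead.")
        .required(false)
        .identifiesControllerService(HiveDBCPService.class)
        .build();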

Aggregations

HiveDBCPService (org.apache.nifi.dbcp.hive.HiveDBCPService): 5
Connection (java.sql.Connection): 4
SQLException (java.sql.SQLException): 4
Statement (java.sql.Statement): 4
File (java.io.File): 3
HashMap (java.util.HashMap): 2
MockFlowFile (org.apache.nifi.util.MockFlowFile): 2
ThriftService (com.thinkbiganalytics.nifi.v2.thrift.ThriftService): 1
BufferedReader (java.io.BufferedReader): 1
ByteArrayInputStream (java.io.ByteArrayInputStream): 1
InputStream (java.io.InputStream): 1
InputStreamReader (java.io.InputStreamReader): 1
Charset (java.nio.charset.Charset): 1
PreparedStatement (java.sql.PreparedStatement): 1
ResultSet (java.sql.ResultSet): 1
ArrayList (java.util.ArrayList): 1
Collections (java.util.Collections): 1
HashSet (java.util.HashSet): 1
List (java.util.List): 1
Map (java.util.Map): 1