Use of org.apache.nifi.dbcp.hive.HiveDBCPService in project nifi by apache.
The class TestSelectHiveQL, method testWithSqlException.
@Test
public void testWithSqlException() throws SQLException {
// remove previous test database, if any
final File dbLocation = new File(DB_LOCATION);
dbLocation.delete();
// load test data to database
final Connection con = ((HiveDBCPService) runner.getControllerService("dbcp")).getConnection();
Statement stmt = con.createStatement();
try {
stmt.execute("drop table TEST_NO_ROWS");
} catch (final SQLException sqle) {
// Nothing to do, probably means the table didn't exist
}
stmt.execute("create table TEST_NO_ROWS (id integer)");
runner.setIncomingConnection(false);
// Try a syntactically valid SQL statement that will fail at execution time (e.g., the column val1 does not exist)
runner.setProperty(SelectHiveQL.HIVEQL_SELECT_QUERY, "SELECT val1 FROM TEST_NO_ROWS");
runner.run();
runner.assertAllFlowFilesTransferred(SelectHiveQL.REL_FAILURE, 1);
}
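The runner and the "dbcp" controller service referenced above come from the test class setup, which is not shown here. A minimal sketch of such a setup, assuming an embedded-Derby stand-in for HiveDBCPService (the class name DBCPServiceSimpleImpl, the DB_LOCATION value, and the Derby URL are illustrative assumptions, not necessarily what the real test uses):

// Test scaffolding sketch; class and constant names here are illustrative assumptions.
// Uses org.apache.nifi.util.TestRunner / TestRunners and org.apache.nifi.controller.AbstractControllerService.
private static final String DB_LOCATION = "target/db";

private TestRunner runner;

@Before
public void setup() throws InitializationException {
    final HiveDBCPService dbcp = new DBCPServiceSimpleImpl();
    runner = TestRunners.newTestRunner(SelectHiveQL.class);
    runner.addControllerService("dbcp", dbcp);
    runner.enableControllerService(dbcp);
    runner.setProperty(SelectHiveQL.HIVE_DBCP_SERVICE, "dbcp");
}

// Embedded-Derby implementation of HiveDBCPService, good enough for these tests.
private static class DBCPServiceSimpleImpl extends AbstractControllerService implements HiveDBCPService {

    @Override
    public Connection getConnection() throws ProcessException {
        try {
            Class.forName("org.apache.derby.jdbc.EmbeddedDriver");
            return DriverManager.getConnection(getConnectionURL());
        } catch (final Exception e) {
            throw new ProcessException("getConnection failed: " + e, e);
        }
    }

    @Override
    public String getConnectionURL() {
        return "jdbc:derby:" + DB_LOCATION + ";create=true";
    }
}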
Use of org.apache.nifi.dbcp.hive.HiveDBCPService in project nifi by apache.
The class TestSelectHiveQL, method invokeOnTrigger.
public void invokeOnTrigger(final String query, final boolean incomingFlowFile, String outputFormat) throws InitializationException, ClassNotFoundException, SQLException, IOException {
// remove previous test database, if any
final File dbLocation = new File(DB_LOCATION);
dbLocation.delete();
// load test data to database
final Connection con = ((HiveDBCPService) runner.getControllerService("dbcp")).getConnection();
final Statement stmt = con.createStatement();
try {
stmt.execute("drop table persons");
} catch (final SQLException sqle) {
// Nothing to do here, the table didn't exist
}
stmt.execute("create table persons (id integer, name varchar(100), code integer)");
Random rng = new Random(53496);
final int nrOfRows = 100;
stmt.executeUpdate("insert into persons values (1, 'Joe Smith', " + rng.nextInt(469947) + ")");
for (int i = 2; i < nrOfRows; i++) {
stmt.executeUpdate("insert into persons values (" + i + ", 'Someone Else', " + rng.nextInt(469947) + ")");
}
stmt.executeUpdate("insert into persons values (" + nrOfRows + ", 'Last Person', NULL)");
LOGGER.info("test data loaded");
runner.setProperty(SelectHiveQL.HIVEQL_SELECT_QUERY, query);
runner.setProperty(HIVEQL_OUTPUT_FORMAT, outputFormat);
if (incomingFlowFile) {
// incoming FlowFile content is not used, but attributes are used
final Map<String, String> attributes = new HashMap<>();
attributes.put("person.id", "10");
runner.enqueue("Hello".getBytes(), attributes);
}
runner.setIncomingConnection(incomingFlowFile);
runner.run();
runner.assertAllFlowFilesTransferred(SelectHiveQL.REL_SUCCESS, 1);
final List<MockFlowFile> flowfiles = runner.getFlowFilesForRelationship(SelectHiveQL.REL_SUCCESS);
MockFlowFile flowFile = flowfiles.get(0);
final InputStream in = new ByteArrayInputStream(flowFile.toByteArray());
long recordsFromStream = 0;
if (AVRO.equals(outputFormat)) {
assertEquals(MIME_TYPE_AVRO_BINARY, flowFile.getAttribute(CoreAttributes.MIME_TYPE.key()));
final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
GenericRecord record = null;
while (dataFileReader.hasNext()) {
// Reuse record object by passing it to next(). This saves us from
// allocating and garbage collecting many objects for files with
// many items.
record = dataFileReader.next(record);
recordsFromStream++;
}
}
} else {
assertEquals(CSV_MIME_TYPE, flowFile.getAttribute(CoreAttributes.MIME_TYPE.key()));
BufferedReader br = new BufferedReader(new InputStreamReader(in));
String headerRow = br.readLine();
// Derby capitalizes column names
assertEquals("PERSONID,PERSONNAME,PERSONCODE", headerRow);
// Validate rows
String line;
while ((line = br.readLine()) != null) {
recordsFromStream++;
String[] values = line.split(",");
if (recordsFromStream < (nrOfRows - 10)) {
assertEquals(3, values.length);
assertTrue(values[1].startsWith("\""));
assertTrue(values[1].endsWith("\""));
} else {
// Middle value is null
assertEquals(2, values.length);
}
}
}
assertEquals(nrOfRows - 10, recordsFromStream);
assertEquals(recordsFromStream, Integer.parseInt(flowFile.getAttribute(SelectHiveQL.RESULT_ROW_COUNT)));
flowFile.assertAttributeEquals(AbstractHiveQLProcessor.ATTR_INPUT_TABLES, "persons");
}
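Concrete test methods then simply delegate to invokeOnTrigger with a query and an output format. A hedged example of such callers (the query text below is an assumption chosen to match the PERSONID/PERSONNAME/PERSONCODE header and the nrOfRows - 10 row count asserted above; the real test class defines its own query constants):

// Illustrative callers of invokeOnTrigger; the query string is an assumption.
@Test
public void testWithCsvOutput() throws Exception {
    // Aliases produce the expected header; the predicate drops the first ten rows.
    invokeOnTrigger("SELECT id AS personId, name AS personName, code AS personCode FROM persons WHERE id > 10", false, CSV);
}

@Test
public void testWithAvroOutputAndIncomingFlowFile() throws Exception {
    invokeOnTrigger("SELECT id AS personId, name AS personName, code AS personCode FROM persons WHERE id > 10", true, AVRO);
}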
Use of org.apache.nifi.dbcp.hive.HiveDBCPService in project nifi by apache.
The class SelectHiveQL, method onTrigger.
private void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
FlowFile fileToProcess = (context.hasIncomingConnection() ? session.get() : null);
FlowFile flowfile = null;
// If there is an incoming connection from another processor but no FlowFile is available, there is nothing to do on this trigger; a missing FlowFile is only acceptable when all incoming connections are self-loops.
if (context.hasIncomingConnection()) {
if (fileToProcess == null && context.hasNonLoopConnection()) {
return;
}
}
final ComponentLog logger = getLogger();
final HiveDBCPService dbcpService = context.getProperty(HIVE_DBCP_SERVICE).asControllerService(HiveDBCPService.class);
final Charset charset = Charset.forName(context.getProperty(CHARSET).getValue());
final boolean flowbased = !(context.getProperty(HIVEQL_SELECT_QUERY).isSet());
// Source the SQL
final String selectQuery;
if (context.getProperty(HIVEQL_SELECT_QUERY).isSet()) {
selectQuery = context.getProperty(HIVEQL_SELECT_QUERY).evaluateAttributeExpressions(fileToProcess).getValue();
} else {
// If the query is not set, then an incoming flow file is required, and expected to contain a valid SQL select query.
// If there is no incoming connection, onTrigger will not be called as the processor will fail when scheduled.
final StringBuilder queryContents = new StringBuilder();
session.read(fileToProcess, in -> queryContents.append(IOUtils.toString(in, charset)));
selectQuery = queryContents.toString();
}
final Integer fetchSize = context.getProperty(FETCH_SIZE).evaluateAttributeExpressions(fileToProcess).asInteger();
final Integer maxRowsPerFlowFile = context.getProperty(MAX_ROWS_PER_FLOW_FILE).evaluateAttributeExpressions(fileToProcess).asInteger();
final Integer maxFragments = context.getProperty(MAX_FRAGMENTS).isSet() ? context.getProperty(MAX_FRAGMENTS).evaluateAttributeExpressions(fileToProcess).asInteger() : 0;
final String outputFormat = context.getProperty(HIVEQL_OUTPUT_FORMAT).getValue();
final boolean convertNamesForAvro = context.getProperty(NORMALIZE_NAMES_FOR_AVRO).asBoolean();
final StopWatch stopWatch = new StopWatch(true);
final boolean header = context.getProperty(HIVEQL_CSV_HEADER).asBoolean();
final String altHeader = context.getProperty(HIVEQL_CSV_ALT_HEADER).evaluateAttributeExpressions(fileToProcess).getValue();
final String delimiter = context.getProperty(HIVEQL_CSV_DELIMITER).evaluateAttributeExpressions(fileToProcess).getValue();
final boolean quote = context.getProperty(HIVEQL_CSV_QUOTE).asBoolean();
final boolean escape = context.getProperty(HIVEQL_CSV_ESCAPE).asBoolean();
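// All fragments produced by this invocation share the identifier below so downstream processors can correlate them.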
final String fragmentIdentifier = UUID.randomUUID().toString();
try (final Connection con = dbcpService.getConnection();
final Statement st = (flowbased ? con.prepareStatement(selectQuery) : con.createStatement())) {
if (fetchSize != null && fetchSize > 0) {
try {
st.setFetchSize(fetchSize);
} catch (SQLException se) {
// Not all drivers support this, just log the error (at debug level) and move on
logger.debug("Cannot set fetch size to {} due to {}", new Object[] { fetchSize, se.getLocalizedMessage() }, se);
}
}
final List<FlowFile> resultSetFlowFiles = new ArrayList<>();
try {
logger.debug("Executing query {}", new Object[] { selectQuery });
if (flowbased) {
// Hive JDBC doesn't support this yet:
// ParameterMetaData pmd = ((PreparedStatement)st).getParameterMetaData();
// int paramCount = pmd.getParameterCount();
// Alternate way to determine number of params in SQL.
int paramCount = StringUtils.countMatches(selectQuery, "?");
if (paramCount > 0) {
setParameters(1, (PreparedStatement) st, paramCount, fileToProcess.getAttributes());
}
}
final ResultSet resultSet;
try {
resultSet = (flowbased ? ((PreparedStatement) st).executeQuery() : st.executeQuery(selectQuery));
} catch (SQLException se) {
// If an error occurs during the query, a flowfile is expected to be routed to failure, so ensure one here
flowfile = (fileToProcess == null) ? session.create() : fileToProcess;
fileToProcess = null;
throw se;
}
int fragmentIndex = 0;
String baseFilename = (fileToProcess != null) ? fileToProcess.getAttribute(CoreAttributes.FILENAME.key()) : null;
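// Each pass through the loop below drains up to maxRowsPerFlowFile rows (or the whole ResultSet when no limit is set) into a new FlowFile; it stops once a pass yields no rows (after the first FlowFile) or maxFragments is reached.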
while (true) {
final AtomicLong nrOfRows = new AtomicLong(0L);
flowfile = (flowfile == null) ? session.create() : session.create(flowfile);
if (baseFilename == null) {
baseFilename = flowfile.getAttribute(CoreAttributes.FILENAME.key());
}
try {
flowfile = session.write(flowfile, out -> {
try {
if (AVRO.equals(outputFormat)) {
nrOfRows.set(HiveJdbcCommon.convertToAvroStream(resultSet, out, maxRowsPerFlowFile, convertNamesForAvro));
} else if (CSV.equals(outputFormat)) {
CsvOutputOptions options = new CsvOutputOptions(header, altHeader, delimiter, quote, escape, maxRowsPerFlowFile);
nrOfRows.set(HiveJdbcCommon.convertToCsvStream(resultSet, out, options));
} else {
nrOfRows.set(0L);
throw new ProcessException("Unsupported output format: " + outputFormat);
}
} catch (final SQLException | RuntimeException e) {
throw new ProcessException("Error during database query or conversion of records.", e);
}
});
} catch (ProcessException e) {
// Add flowfile to results before rethrowing so it will be removed from session in outer catch
resultSetFlowFiles.add(flowfile);
throw e;
}
if (nrOfRows.get() > 0 || resultSetFlowFiles.isEmpty()) {
final Map<String, String> attributes = new HashMap<>();
// Set attribute for how many rows were selected
attributes.put(RESULT_ROW_COUNT, String.valueOf(nrOfRows.get()));
try {
// Set input/output table names by parsing the query
attributes.putAll(toQueryTableAttributes(findTableNames(selectQuery)));
} catch (Exception e) {
// If failed to parse the query, just log a warning message, but continue.
getLogger().warn("Failed to parse query: {} due to {}", new Object[] { selectQuery, e }, e);
}
// Set MIME type on output document and add extension to filename
if (AVRO.equals(outputFormat)) {
attributes.put(CoreAttributes.MIME_TYPE.key(), MIME_TYPE_AVRO_BINARY);
attributes.put(CoreAttributes.FILENAME.key(), baseFilename + "." + fragmentIndex + ".avro");
} else if (CSV.equals(outputFormat)) {
attributes.put(CoreAttributes.MIME_TYPE.key(), CSV_MIME_TYPE);
attributes.put(CoreAttributes.FILENAME.key(), baseFilename + "." + fragmentIndex + ".csv");
}
if (maxRowsPerFlowFile > 0) {
attributes.put("fragment.identifier", fragmentIdentifier);
attributes.put("fragment.index", String.valueOf(fragmentIndex));
}
flowfile = session.putAllAttributes(flowfile, attributes);
logger.info("{} contains {} Avro records; transferring to 'success'", new Object[] { flowfile, nrOfRows.get() });
if (context.hasIncomingConnection()) {
// If the flow file came from an incoming connection, issue a Fetch provenance event
session.getProvenanceReporter().fetch(flowfile, dbcpService.getConnectionURL(), "Retrieved " + nrOfRows.get() + " rows", stopWatch.getElapsed(TimeUnit.MILLISECONDS));
} else {
// If we created a flow file from rows received from Hive, issue a Receive provenance event
session.getProvenanceReporter().receive(flowfile, dbcpService.getConnectionURL(), stopWatch.getElapsed(TimeUnit.MILLISECONDS));
}
resultSetFlowFiles.add(flowfile);
} else {
// No rows were returned and at least one FlowFile has already been produced, so we're done; remove this empty FlowFile and stop
session.remove(flowfile);
break;
}
fragmentIndex++;
if (maxFragments > 0 && fragmentIndex >= maxFragments) {
break;
}
}
for (int i = 0; i < resultSetFlowFiles.size(); i++) {
// Set count on all FlowFiles
if (maxRowsPerFlowFile > 0) {
resultSetFlowFiles.set(i, session.putAttribute(resultSetFlowFiles.get(i), "fragment.count", Integer.toString(fragmentIndex)));
}
}
} catch (final SQLException e) {
throw e;
}
session.transfer(resultSetFlowFiles, REL_SUCCESS);
} catch (final ProcessException | SQLException e) {
logger.error("Issue processing SQL {} due to {}.", new Object[] { selectQuery, e });
if (flowfile == null) {
// This can happen if any exceptions occur while setting up the connection, statement, etc.
logger.error("Unable to execute HiveQL select query {} due to {}. No FlowFile to route to failure", new Object[] { selectQuery, e });
context.yield();
} else {
if (context.hasIncomingConnection()) {
logger.error("Unable to execute HiveQL select query {} for {} due to {}; routing to failure", new Object[] { selectQuery, flowfile, e });
flowfile = session.penalize(flowfile);
} else {
logger.error("Unable to execute HiveQL select query {} due to {}; routing to failure", new Object[] { selectQuery, e });
context.yield();
}
session.transfer(flowfile, REL_FAILURE);
}
} finally {
if (fileToProcess != null) {
session.remove(fileToProcess);
}
}
}
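Since the loop above stamps fragment.identifier, fragment.index, and fragment.count onto every FlowFile whenever a row limit is set, the fragmenting behaviour can be exercised from a test. A minimal sketch, assuming the TestRunner/"dbcp" setup shown earlier, a populated persons table, and that MAX_ROWS_PER_FLOW_FILE is exposed as a property descriptor (it is read in onTrigger above):

// Sketch only: exercises the fragmenting path of SelectHiveQL under the assumed test setup.
@Test
public void testMaxRowsPerFlowFileProducesFragments() throws Exception {
    runner.setIncomingConnection(false);
    runner.setProperty(SelectHiveQL.HIVEQL_SELECT_QUERY, "SELECT * FROM persons");
    runner.setProperty(SelectHiveQL.MAX_ROWS_PER_FLOW_FILE, "10");
    runner.run();

    final List<MockFlowFile> fragments = runner.getFlowFilesForRelationship(SelectHiveQL.REL_SUCCESS);
    for (final MockFlowFile fragment : fragments) {
        // Every fragment carries the shared identifier, its own index, and the total fragment count.
        fragment.assertAttributeExists("fragment.identifier");
        fragment.assertAttributeExists("fragment.index");
        fragment.assertAttributeEquals("fragment.count", String.valueOf(fragments.size()));
    }
}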
Use of org.apache.nifi.dbcp.hive.HiveDBCPService in project nifi by apache.
The class TestSelectHiveQL, method testWithNullIntColumn.
@Test
public void testWithNullIntColumn() throws SQLException {
// remove previous test database, if any
final File dbLocation = new File(DB_LOCATION);
dbLocation.delete();
// load test data to database
final Connection con = ((HiveDBCPService) runner.getControllerService("dbcp")).getConnection();
Statement stmt = con.createStatement();
try {
stmt.execute("drop table TEST_NULL_INT");
} catch (final SQLException sqle) {
// Nothing to do, probably means the table didn't exist
}
stmt.execute("create table TEST_NULL_INT (id integer not null, val1 integer, val2 integer, constraint my_pk primary key (id))");
stmt.execute("insert into TEST_NULL_INT (id, val1, val2) VALUES (0, NULL, 1)");
stmt.execute("insert into TEST_NULL_INT (id, val1, val2) VALUES (1, 1, 1)");
runner.setIncomingConnection(false);
runner.setProperty(SelectHiveQL.HIVEQL_SELECT_QUERY, "SELECT * FROM TEST_NULL_INT");
runner.run();
runner.assertAllFlowFilesTransferred(SelectHiveQL.REL_SUCCESS, 1);
runner.getFlowFilesForRelationship(SelectHiveQL.REL_SUCCESS).get(0).assertAttributeEquals(SelectHiveQL.RESULT_ROW_COUNT, "2");
}
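The NULL val1 value can also be checked by decoding the FlowFile content. A hedged sketch that could be appended to the test above (widening its throws clause as needed), assuming the processor's default output format is Avro, that Derby reports upper-case column names, and that the rows come back in insertion order:

// Sketch: decode the Avro payload from the test above and verify the NULL column.
final MockFlowFile avroFlowFile = runner.getFlowFilesForRelationship(SelectHiveQL.REL_SUCCESS).get(0);
final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
try (DataFileStream<GenericRecord> records =
         new DataFileStream<>(new ByteArrayInputStream(avroFlowFile.toByteArray()), datumReader)) {
    final GenericRecord first = records.next();
    assertNull(first.get("VAL1"));    // inserted as NULL
    assertNotNull(first.get("VAL2")); // inserted as 1
}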
Use of org.apache.nifi.dbcp.hive.HiveDBCPService in project kylo by Teradata.
The class MergeTable, method getConnection.
@Override
public Connection getConnection(ProcessContext context) {
ThriftService thriftService = context.getProperty(THRIFT_SERVICE).asControllerService(ThriftService.class);
HiveDBCPService hiveConnectionPool = context.getProperty(HIVE_CONNECTION_POOL).asControllerService(HiveDBCPService.class);
if (hiveConnectionPool != null) {
getLogger().info("Returning Connection from HiveConnectionPool");
return hiveConnectionPool.getConnection();
} else {
getLogger().info("Returning Connection from ThriftConnectionPool");
return thriftService.getConnection();
}
}
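A caller inside the same processor would typically borrow the returned connection only for the duration of a single statement. A minimal, hedged sketch of such a caller (the INSERT statement and table names are placeholders and do not reflect Kylo's actual merge logic):

// Illustrative use of getConnection(context); the HiveQL below is a placeholder only.
try (final Connection connection = getConnection(context);
     final Statement statement = connection.createStatement()) {
    statement.execute("INSERT INTO target_table SELECT * FROM source_table");
} catch (final SQLException e) {
    getLogger().error("Failed to execute merge statement", e);
    throw new ProcessException(e);
}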