
Example 1 with RowSerializer

Use of org.apache.nifi.hbase.io.RowSerializer in project nifi by apache.

The class FetchHBaseRow, method onTrigger.

@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
    final FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    final String tableName = context.getProperty(TABLE_NAME).evaluateAttributeExpressions(flowFile).getValue();
    if (StringUtils.isBlank(tableName)) {
        getLogger().error("Table Name is blank or null for {}, transferring to failure", new Object[] { flowFile });
        session.transfer(session.penalize(flowFile), REL_FAILURE);
        return;
    }
    final String rowId = context.getProperty(ROW_ID).evaluateAttributeExpressions(flowFile).getValue();
    if (StringUtils.isBlank(rowId)) {
        getLogger().error("Row Identifier is blank or null for {}, transferring to failure", new Object[] { flowFile });
        session.transfer(session.penalize(flowFile), REL_FAILURE);
        return;
    }
    final List<Column> columns = getColumns(context.getProperty(COLUMNS).evaluateAttributeExpressions(flowFile).getValue());
    final HBaseClientService hBaseClientService = context.getProperty(HBASE_CLIENT_SERVICE).asControllerService(HBaseClientService.class);
    final String destination = context.getProperty(DESTINATION).getValue();
    final boolean base64Encode = context.getProperty(JSON_VALUE_ENCODING).getValue().equals(ENCODING_BASE64.getValue());
    final RowSerializer rowSerializer = base64Encode ? base64RowSerializer : regularRowSerializer;
    final FetchHBaseRowHandler handler = destination.equals(DESTINATION_CONTENT.getValue()) ? new FlowFileContentHandler(flowFile, session, rowSerializer) : new FlowFileAttributeHandler(flowFile, session, rowSerializer);
    final byte[] rowIdBytes = rowId.getBytes(StandardCharsets.UTF_8);
    try {
        hBaseClientService.scan(tableName, rowIdBytes, rowIdBytes, columns, handler);
    } catch (Exception e) {
        getLogger().error("Unable to fetch row {} from  {} due to {}", new Object[] { rowId, tableName, e });
        session.transfer(handler.getFlowFile(), REL_FAILURE);
        return;
    }
    FlowFile handlerFlowFile = handler.getFlowFile();
    if (!handler.handledRow()) {
        getLogger().debug("Row {} not found in {}, transferring to not found", new Object[] { rowId, tableName });
        session.transfer(handlerFlowFile, REL_NOT_FOUND);
        return;
    }
    if (getLogger().isDebugEnabled()) {
        getLogger().debug("Fetched {} from {} with row id {}", new Object[] { handlerFlowFile, tableName, rowId });
    }
    final Map<String, String> attributes = new HashMap<>();
    attributes.put(HBASE_TABLE_ATTR, tableName);
    if (destination.equals(DESTINATION_CONTENT.getValue())) {
        attributes.put(CoreAttributes.MIME_TYPE.key(), "application/json");
    }
    handlerFlowFile = session.putAllAttributes(handlerFlowFile, attributes);
    final String transitUri = hBaseClientService.toTransitUri(tableName, rowId);
    // Regardless of where the result is written, emit a fetch event.
    session.getProvenanceReporter().fetch(handlerFlowFile, transitUri);
    if (!destination.equals(DESTINATION_CONTENT.getValue())) {
        session.getProvenanceReporter().modifyAttributes(handlerFlowFile, "Added attributes to FlowFile from " + transitUri);
    }
    session.transfer(handlerFlowFile, REL_SUCCESS);
}
Also used: FlowFile(org.apache.nifi.flowfile.FlowFile) HashMap(java.util.HashMap) ProcessException(org.apache.nifi.processor.exception.ProcessException) Column(org.apache.nifi.hbase.scan.Column) JsonFullRowSerializer(org.apache.nifi.hbase.io.JsonFullRowSerializer) RowSerializer(org.apache.nifi.hbase.io.RowSerializer) JsonQualifierAndValueRowSerializer(org.apache.nifi.hbase.io.JsonQualifierAndValueRowSerializer)
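
The handler types used above (FetchHBaseRowHandler, FlowFileContentHandler, FlowFileAttributeHandler) are not shown in this example. Judging only from the calls visible in onTrigger (the handler is passed to hBaseClientService.scan(...), then queried via getFlowFile() and handledRow()), a content-writing handler could be sketched roughly as below. This is a minimal illustration under those assumptions, not the actual NiFi implementation; the class name and field layout are hypothetical.

import java.io.OutputStream;

import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.hbase.io.RowSerializer;
import org.apache.nifi.hbase.scan.ResultCell;
import org.apache.nifi.hbase.scan.ResultHandler;
import org.apache.nifi.processor.ProcessSession;

// Hypothetical sketch: a ResultHandler that writes the serialized row into the
// FlowFile content, mirroring how FlowFileContentHandler is used in onTrigger.
public class FlowFileContentHandlerSketch implements ResultHandler {

    private final ProcessSession session;
    private final RowSerializer serializer;
    private FlowFile flowFile;
    private boolean handledRow = false;

    public FlowFileContentHandlerSketch(final FlowFile flowFile, final ProcessSession session, final RowSerializer serializer) {
        this.flowFile = flowFile;
        this.session = session;
        this.serializer = serializer;
    }

    @Override
    public void handle(final byte[] rowKey, final ResultCell[] resultCells) {
        // overwrite the FlowFile content with the serialized row
        flowFile = session.write(flowFile, (final OutputStream out) -> serializer.serialize(rowKey, resultCells, out));
        handledRow = true;
    }

    // onTrigger queries these after the scan completes
    public FlowFile getFlowFile() {
        return flowFile;
    }

    public boolean handledRow() {
        return handledRow;
    }
}

Because the same handler object is used for both destinations, onTrigger can stay agnostic about whether the row landed in the content or in attributes; only the success-path attribute handling differs.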

Example 2 with RowSerializer

Use of org.apache.nifi.hbase.io.RowSerializer in project nifi by apache.

The class GetHBase, method onTrigger.

@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    final String tableName = context.getProperty(TABLE_NAME).getValue();
    final String initialTimeRange = context.getProperty(INITIAL_TIMERANGE).getValue();
    final String filterExpression = context.getProperty(FILTER_EXPRESSION).getValue();
    final HBaseClientService hBaseClientService = context.getProperty(HBASE_CLIENT_SERVICE).asControllerService(HBaseClientService.class);
    // if the table was changed then remove any previous state
    if (previousTable != null && !tableName.equals(previousTable)) {
        try {
            context.getStateManager().clear(Scope.CLUSTER);
        } catch (final IOException ioe) {
            getLogger().warn("Failed to clear Cluster State", ioe);
        }
    }
    // always remember the current table so a change can be detected on the next trigger;
    // if this assignment stayed inside the if block, previousTable would remain null forever
    previousTable = tableName;
    try {
        final Charset charset = Charset.forName(context.getProperty(CHARSET).getValue());
        final RowSerializer serializer = new JsonRowSerializer(charset);
        this.lastResult = getState(context.getStateManager());
        final long defaultMinTime = (initialTimeRange.equals(NONE.getValue()) ? 0L : System.currentTimeMillis());
        final long minTime = (lastResult == null ? defaultMinTime : lastResult.getTimestamp());
        final Map<String, Set<String>> cellsMatchingTimestamp = new HashMap<>();
        final AtomicReference<Long> rowsPulledHolder = new AtomicReference<>(0L);
        final AtomicReference<Long> latestTimestampHolder = new AtomicReference<>(minTime);
        hBaseClientService.scan(tableName, columns, filterExpression, minTime, new ResultHandler() {

            @Override
            public void handle(final byte[] rowKey, final ResultCell[] resultCells) {
                final String rowKeyString = new String(rowKey, StandardCharsets.UTF_8);
                // check if latest cell timestamp is equal to our cutoff.
                // if any of the cells have a timestamp later than our cutoff, then we
                // want the row. But if the cell with the latest timestamp is equal to
                // our cutoff, then we want to check if that's one of the cells that
                // we have already seen.
                long latestCellTimestamp = 0L;
                for (final ResultCell cell : resultCells) {
                    if (cell.getTimestamp() > latestCellTimestamp) {
                        latestCellTimestamp = cell.getTimestamp();
                    }
                }
                // if the latest cell timestamp is before our minimum time, we have already seen this row, so skip it.
                if (latestCellTimestamp < minTime) {
                    getLogger().debug("latest cell timestamp for row {} is {}, which is earlier than the minimum time of {}", new Object[] { rowKeyString, latestCellTimestamp, minTime });
                    return;
                }
                if (latestCellTimestamp == minTime) {
                    // latest cell timestamp is equal to our minimum time. Check if all cells that have
                    // that timestamp are in our list of previously seen cells.
                    boolean allSeen = true;
                    for (final ResultCell cell : resultCells) {
                        if (cell.getTimestamp() == latestCellTimestamp) {
                            if (lastResult == null || !lastResult.contains(cell)) {
                                allSeen = false;
                                break;
                            }
                        }
                    }
                    if (allSeen) {
                        // we have already seen all of the cells for this row. We do not want to
                        // include this cell in our output.
                        getLogger().debug("all cells for row {} have already been seen", new Object[] { rowKeyString });
                        return;
                    }
                }
                // record the cells whose timestamp matches the latest timestamp seen so far; cells with earlier timestamps can be ignored.
                if (latestCellTimestamp >= latestTimestampHolder.get()) {
                    // new timestamp, so clear all of the 'matching cells'
                    if (latestCellTimestamp > latestTimestampHolder.get()) {
                        latestTimestampHolder.set(latestCellTimestamp);
                        cellsMatchingTimestamp.clear();
                    }
                    for (final ResultCell cell : resultCells) {
                        final long ts = cell.getTimestamp();
                        if (ts == latestCellTimestamp) {
                            final byte[] rowValue = Arrays.copyOfRange(cell.getRowArray(), cell.getRowOffset(), cell.getRowLength() + cell.getRowOffset());
                            final byte[] cellValue = Arrays.copyOfRange(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength() + cell.getValueOffset());
                            final String rowHash = new String(rowValue, StandardCharsets.UTF_8);
                            Set<String> cellHashes = cellsMatchingTimestamp.get(rowHash);
                            if (cellHashes == null) {
                                cellHashes = new HashSet<>();
                                cellsMatchingTimestamp.put(rowHash, cellHashes);
                            }
                            cellHashes.add(new String(cellValue, StandardCharsets.UTF_8));
                        }
                    }
                }
                // write the row to a new FlowFile.
                FlowFile flowFile = session.create();
                flowFile = session.write(flowFile, new OutputStreamCallback() {

                    @Override
                    public void process(final OutputStream out) throws IOException {
                        serializer.serialize(rowKey, resultCells, out);
                    }
                });
                final Map<String, String> attributes = new HashMap<>();
                attributes.put("hbase.table", tableName);
                attributes.put("mime.type", "application/json");
                flowFile = session.putAllAttributes(flowFile, attributes);
                session.getProvenanceReporter().receive(flowFile, hBaseClientService.toTransitUri(tableName, rowKeyString));
                session.transfer(flowFile, REL_SUCCESS);
                getLogger().debug("Received {} from HBase with row key {}", new Object[] { flowFile, rowKeyString });
                // we could potentially have a huge number of rows. If we get to 500, go ahead and commit the
                // session so that we can avoid buffering tons of FlowFiles without ever sending any out.
                long rowsPulled = rowsPulledHolder.get();
                rowsPulledHolder.set(++rowsPulled);
                // a second ++ here would make the modulo check off by one relative to the stored count
                if (rowsPulled % getBatchSize() == 0) {
                    session.commit();
                }
            }
        });
        final ScanResult scanResults = new ScanResult(latestTimestampHolder.get(), cellsMatchingTimestamp);
        // Commit session before we replace the lastResult; if session commit fails, we want
        // to pull these records again.
        session.commit();
        if (lastResult == null || scanResults.getTimestamp() > lastResult.getTimestamp()) {
            lastResult = scanResults;
        } else if (scanResults.getTimestamp() == lastResult.getTimestamp()) {
            final Map<String, Set<String>> combinedResults = new HashMap<>(scanResults.getMatchingCells());
            // do a deep copy because the Set may be modified below.
            for (final Map.Entry<String, Set<String>> entry : scanResults.getMatchingCells().entrySet()) {
                combinedResults.put(entry.getKey(), new HashSet<>(entry.getValue()));
            }
            // combine the results from 'lastResult'
            for (final Map.Entry<String, Set<String>> entry : lastResult.getMatchingCells().entrySet()) {
                final Set<String> existing = combinedResults.get(entry.getKey());
                if (existing == null) {
                    combinedResults.put(entry.getKey(), new HashSet<>(entry.getValue()));
                } else {
                    existing.addAll(entry.getValue());
                }
            }
            final ScanResult scanResult = new ScanResult(scanResults.getTimestamp(), combinedResults);
            lastResult = scanResult;
        }
        // save state using the framework's state manager
        storeState(lastResult, context.getStateManager());
    } catch (final IOException e) {
        getLogger().error("Failed to receive data from HBase due to {}", e);
        session.rollback();
    } finally {
        // if we failed, we want to yield so that we don't hammer hbase. If we succeed, then we have
        // pulled all of the records, so we want to wait a bit before hitting hbase again anyway.
        context.yield();
    }
}
Also used: Set(java.util.Set) HashSet(java.util.HashSet) HashMap(java.util.HashMap) OutputStream(java.io.OutputStream) ResultHandler(org.apache.nifi.hbase.scan.ResultHandler) ResultCell(org.apache.nifi.hbase.scan.ResultCell) JsonRowSerializer(org.apache.nifi.hbase.io.JsonRowSerializer) RowSerializer(org.apache.nifi.hbase.io.RowSerializer) OutputStreamCallback(org.apache.nifi.processor.io.OutputStreamCallback) FlowFile(org.apache.nifi.flowfile.FlowFile) Charset(java.nio.charset.Charset) AtomicReference(java.util.concurrent.atomic.AtomicReference) IOException(java.io.IOException) Map(java.util.Map) StateMap(org.apache.nifi.components.state.StateMap)
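
Both examples program against the RowSerializer interface and only choose a concrete implementation (JsonFullRowSerializer, JsonQualifierAndValueRowSerializer, or JsonRowSerializer) at runtime. Based on the call serializer.serialize(rowKey, resultCells, out) in Example 2, a custom implementation might look roughly like the sketch below. This is an assumption-laden illustration, not a drop-in class: the real interface may declare additional methods beyond the one inferred here, and the qualifier accessors are assumed to mirror the row/value accessors used in GetHBase.onTrigger above.

import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.nifi.hbase.io.RowSerializer;
import org.apache.nifi.hbase.scan.ResultCell;

// Hypothetical sketch: emits one "qualifier=value" line per cell instead of JSON.
public class PlainTextRowSerializer implements RowSerializer {

    @Override
    public void serialize(final byte[] rowKey, final ResultCell[] cells, final OutputStream out) throws IOException {
        out.write(("row=" + new String(rowKey, StandardCharsets.UTF_8) + "\n").getBytes(StandardCharsets.UTF_8));
        for (final ResultCell cell : cells) {
            // copy the qualifier and value out of their backing arrays, using the same
            // offset/length idiom seen in GetHBase.onTrigger above (assumed accessors)
            final byte[] qualifier = Arrays.copyOfRange(cell.getQualifierArray(), cell.getQualifierOffset(), cell.getQualifierOffset() + cell.getQualifierLength());
            final byte[] value = Arrays.copyOfRange(cell.getValueArray(), cell.getValueOffset(), cell.getValueOffset() + cell.getValueLength());
            out.write((new String(qualifier, StandardCharsets.UTF_8) + "=" + new String(value, StandardCharsets.UTF_8) + "\n").getBytes(StandardCharsets.UTF_8));
        }
    }
}

Swapping such an implementation into either onTrigger would change only the bytes written per row; the scan, state tracking, and FlowFile routing logic are independent of the serialization format.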

Aggregations

HashMap (java.util.HashMap)2 FlowFile (org.apache.nifi.flowfile.FlowFile)2 RowSerializer (org.apache.nifi.hbase.io.RowSerializer)2 IOException (java.io.IOException)1 OutputStream (java.io.OutputStream)1 Charset (java.nio.charset.Charset)1 HashSet (java.util.HashSet)1 Map (java.util.Map)1 Set (java.util.Set)1 AtomicReference (java.util.concurrent.atomic.AtomicReference)1 StateMap (org.apache.nifi.components.state.StateMap)1 JsonFullRowSerializer (org.apache.nifi.hbase.io.JsonFullRowSerializer)1 JsonQualifierAndValueRowSerializer (org.apache.nifi.hbase.io.JsonQualifierAndValueRowSerializer)1 JsonRowSerializer (org.apache.nifi.hbase.io.JsonRowSerializer)1 Column (org.apache.nifi.hbase.scan.Column)1 ResultCell (org.apache.nifi.hbase.scan.ResultCell)1 ResultHandler (org.apache.nifi.hbase.scan.ResultHandler)1 ProcessException (org.apache.nifi.processor.exception.ProcessException)1 OutputStreamCallback (org.apache.nifi.processor.io.OutputStreamCallback)1