Search in sources:

Example 1 with JsonRowSerializer

Use of org.apache.nifi.hbase.io.JsonRowSerializer in the Apache NiFi project.

Class GetHBase, method onTrigger:

@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    final String tableName = context.getProperty(TABLE_NAME).getValue();
    final String initialTimeRange = context.getProperty(INITIAL_TIMERANGE).getValue();
    final String filterExpression = context.getProperty(FILTER_EXPRESSION).getValue();
    final HBaseClientService hBaseClientService = context.getProperty(HBASE_CLIENT_SERVICE).asControllerService(HBaseClientService.class);
    // if the table was changed then remove any previous state
    if (previousTable != null && !tableName.equals(previousTable)) {
        try {
            context.getStateManager().clear(Scope.CLUSTER);
        } catch (final IOException ioe) {
            getLogger().warn("Failed to clear Cluster State", ioe);
        }
    }
    // remember the current table so a later change is detected
    previousTable = tableName;
    try {
        final Charset charset = Charset.forName(context.getProperty(CHARSET).getValue());
        final RowSerializer serializer = new JsonRowSerializer(charset);
        this.lastResult = getState(context.getStateManager());
        final long defaultMinTime = (initialTimeRange.equals(NONE.getValue()) ? 0L : System.currentTimeMillis());
        final long minTime = (lastResult == null ? defaultMinTime : lastResult.getTimestamp());
        final Map<String, Set<String>> cellsMatchingTimestamp = new HashMap<>();
        final AtomicReference<Long> rowsPulledHolder = new AtomicReference<>(0L);
        final AtomicReference<Long> latestTimestampHolder = new AtomicReference<>(minTime);
        hBaseClientService.scan(tableName, columns, filterExpression, minTime, new ResultHandler() {

            @Override
            public void handle(final byte[] rowKey, final ResultCell[] resultCells) {
                final String rowKeyString = new String(rowKey, StandardCharsets.UTF_8);
                // check if latest cell timestamp is equal to our cutoff.
                // if any of the cells have a timestamp later than our cutoff, then we
                // want the row. But if the cell with the latest timestamp is equal to
                // our cutoff, then we want to check if that's one of the cells that
                // we have already seen.
                long latestCellTimestamp = 0L;
                for (final ResultCell cell : resultCells) {
                    if (cell.getTimestamp() > latestCellTimestamp) {
                        latestCellTimestamp = cell.getTimestamp();
                    }
                }
                // the entire row is older than our cutoff, so we've already seen it.
                if (latestCellTimestamp < minTime) {
                    getLogger().debug("latest cell timestamp for row {} is {}, which is earlier than the minimum time of {}", new Object[] { rowKeyString, latestCellTimestamp, minTime });
                    return;
                }
                if (latestCellTimestamp == minTime) {
                    // latest cell timestamp is equal to our minimum time. Check if all cells that have
                    // that timestamp are in our list of previously seen cells.
                    boolean allSeen = true;
                    for (final ResultCell cell : resultCells) {
                        if (cell.getTimestamp() == latestCellTimestamp) {
                            if (lastResult == null || !lastResult.contains(cell)) {
                                allSeen = false;
                                break;
                            }
                        }
                    }
                    if (allSeen) {
                        // we have already seen all of the cells for this row. We do not want to
                        // include this cell in our output.
                        getLogger().debug("all cells for row {} have already been seen", new Object[] { rowKeyString });
                        return;
                    }
                }
                // if the latest cell timestamp is at least as new as the latest timestamp
                // we have seen so far, record the cells that carry it so that the next
                // scan can ignore these cells.
                if (latestCellTimestamp >= latestTimestampHolder.get()) {
                    // new timestamp, so clear all of the 'matching cells'
                    if (latestCellTimestamp > latestTimestampHolder.get()) {
                        latestTimestampHolder.set(latestCellTimestamp);
                        cellsMatchingTimestamp.clear();
                    }
                    for (final ResultCell cell : resultCells) {
                        final long ts = cell.getTimestamp();
                        if (ts == latestCellTimestamp) {
                            final byte[] rowValue = Arrays.copyOfRange(cell.getRowArray(), cell.getRowOffset(), cell.getRowLength() + cell.getRowOffset());
                            final byte[] cellValue = Arrays.copyOfRange(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength() + cell.getValueOffset());
                            final String rowHash = new String(rowValue, StandardCharsets.UTF_8);
                            Set<String> cellHashes = cellsMatchingTimestamp.get(rowHash);
                            if (cellHashes == null) {
                                cellHashes = new HashSet<>();
                                cellsMatchingTimestamp.put(rowHash, cellHashes);
                            }
                            cellHashes.add(new String(cellValue, StandardCharsets.UTF_8));
                        }
                    }
                }
                // write the row to a new FlowFile.
                FlowFile flowFile = session.create();
                flowFile = session.write(flowFile, new OutputStreamCallback() {

                    @Override
                    public void process(final OutputStream out) throws IOException {
                        serializer.serialize(rowKey, resultCells, out);
                    }
                });
                final Map<String, String> attributes = new HashMap<>();
                attributes.put("hbase.table", tableName);
                attributes.put("mime.type", "application/json");
                flowFile = session.putAllAttributes(flowFile, attributes);
                session.getProvenanceReporter().receive(flowFile, hBaseClientService.toTransitUri(tableName, rowKeyString));
                session.transfer(flowFile, REL_SUCCESS);
                getLogger().debug("Received {} from HBase with row key {}", new Object[] { flowFile, rowKeyString });
                // we could potentially have a huge number of rows. If we reach the batch
                // size (500 by default), go ahead and commit the session so that we avoid
                // buffering tons of FlowFiles without ever sending any out.
                final long rowsPulled = rowsPulledHolder.get() + 1;
                rowsPulledHolder.set(rowsPulled);
                if (rowsPulled % getBatchSize() == 0) {
                    session.commit();
                }
            }
        });
        final ScanResult scanResults = new ScanResult(latestTimestampHolder.get(), cellsMatchingTimestamp);
        // Commit session before we replace the lastResult; if session commit fails, we want
        // to pull these records again.
        session.commit();
        if (lastResult == null || scanResults.getTimestamp() > lastResult.getTimestamp()) {
            lastResult = scanResults;
        } else if (scanResults.getTimestamp() == lastResult.getTimestamp()) {
            final Map<String, Set<String>> combinedResults = new HashMap<>(scanResults.getMatchingCells());
            // do a deep copy because the Set may be modified below.
            for (final Map.Entry<String, Set<String>> entry : scanResults.getMatchingCells().entrySet()) {
                combinedResults.put(entry.getKey(), new HashSet<>(entry.getValue()));
            }
            // combine the results from 'lastResult'
            for (final Map.Entry<String, Set<String>> entry : lastResult.getMatchingCells().entrySet()) {
                final Set<String> existing = combinedResults.get(entry.getKey());
                if (existing == null) {
                    combinedResults.put(entry.getKey(), new HashSet<>(entry.getValue()));
                } else {
                    existing.addAll(entry.getValue());
                }
            }
            final ScanResult scanResult = new ScanResult(scanResults.getTimestamp(), combinedResults);
            lastResult = scanResult;
        }
        // save state using the framework's state manager
        storeState(lastResult, context.getStateManager());
    } catch (final IOException e) {
        getLogger().error("Failed to receive data from HBase due to {}", e);
        session.rollback();
    } finally {
        // if we failed, we want to yield so that we don't hammer hbase. If we succeed, then we have
        // pulled all of the records, so we want to wait a bit before hitting hbase again anyway.
        context.yield();
    }
}
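Stripped of the processor plumbing, the serializer usage above is just: build a JsonRowSerializer with a charset, then call serialize(rowKey, cells, out). A minimal sketch of that call in isolation follows; the ResultCell setters are an assumption inferred from the getters used in onTrigger (ResultCell looks like a plain mutable bean), so verify them against the org.apache.nifi.hbase.scan.ResultCell source before relying on this.

import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;
import org.apache.nifi.hbase.io.JsonRowSerializer;
import org.apache.nifi.hbase.scan.ResultCell;

public class JsonRowSerializerSketch {
    public static void main(String[] args) throws Exception {
        final byte[] rowKey = "row-1".getBytes(StandardCharsets.UTF_8);
        final byte[] family = "f".getBytes(StandardCharsets.UTF_8);
        final byte[] qualifier = "q".getBytes(StandardCharsets.UTF_8);
        final byte[] value = "hello".getBytes(StandardCharsets.UTF_8);

        // Assumed setters mirroring the getRowArray/getRowOffset/getRowLength
        // style of accessors used in onTrigger above.
        final ResultCell cell = new ResultCell();
        cell.setRowArray(rowKey);
        cell.setRowOffset(0);
        cell.setRowLength((short) rowKey.length);
        cell.setFamilyArray(family);
        cell.setFamilyOffset(0);
        cell.setFamilyLength((byte) family.length);
        cell.setQualifierArray(qualifier);
        cell.setQualifierOffset(0);
        cell.setQualifierLength(qualifier.length);
        cell.setValueArray(value);
        cell.setValueOffset(0);
        cell.setValueLength(value.length);
        cell.setTimestamp(System.currentTimeMillis());

        // Serialize the row to JSON in memory, as onTrigger does into a FlowFile.
        final ByteArrayOutputStream out = new ByteArrayOutputStream();
        new JsonRowSerializer(StandardCharsets.UTF_8)
                .serialize(rowKey, new ResultCell[] { cell }, out);
        System.out.println(out.toString("UTF-8"));
    }
}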
Also used: Charset (java.nio.charset.Charset), IOException (java.io.IOException), OutputStream (java.io.OutputStream), HashMap (java.util.HashMap), HashSet (java.util.HashSet), Map (java.util.Map), Set (java.util.Set), AtomicReference (java.util.concurrent.atomic.AtomicReference), StateMap (org.apache.nifi.components.state.StateMap), FlowFile (org.apache.nifi.flowfile.FlowFile), JsonRowSerializer (org.apache.nifi.hbase.io.JsonRowSerializer), RowSerializer (org.apache.nifi.hbase.io.RowSerializer), ResultCell (org.apache.nifi.hbase.scan.ResultCell), ResultHandler (org.apache.nifi.hbase.scan.ResultHandler), OutputStreamCallback (org.apache.nifi.processor.io.OutputStreamCallback). The method also relies on java.util.Arrays, java.nio.charset.StandardCharsets, org.apache.nifi.components.state.Scope, and the org.apache.nifi.processor types (ProcessContext, ProcessSession, ProcessException), which the original listing omits.
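Two pieces of the logic above are worth isolating. First, the cutoff decision: a row is emitted only when its newest cell is strictly newer than the stored minimum timestamp, or ties it and carries at least one cell that the previous scan did not record. A self-contained sketch of that rule, using a hypothetical Cell record in place of the NiFi types:

import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Hypothetical stand-in for org.apache.nifi.hbase.scan.ResultCell, for illustration only.
record Cell(String row, String value, long timestamp) {}

public class CutoffSketch {

    // Decide whether a row should be emitted, given the cutoff timestamp from
    // the last scan and the cell values already recorded at that timestamp.
    static boolean shouldEmit(List<Cell> rowCells, long minTime, Set<String> seenAtMinTime) {
        final long latest = rowCells.stream().mapToLong(Cell::timestamp).max().orElse(0L);
        if (latest < minTime) {
            return false; // strictly older than the cutoff: already processed
        }
        if (latest == minTime) {
            // ties the cutoff: emit only if some cell at that timestamp is unseen
            return rowCells.stream()
                    .filter(c -> c.timestamp() == minTime)
                    .anyMatch(c -> !seenAtMinTime.contains(c.value()));
        }
        return true; // strictly newer than the cutoff
    }

    public static void main(String[] args) {
        final Set<String> seen = new HashSet<>(Set.of("a"));
        final List<Cell> row = List.of(new Cell("r1", "a", 100L), new Cell("r1", "b", 100L));
        System.out.println(shouldEmit(row, 100L, seen)); // true: "b" at the cutoff is unseen
        System.out.println(shouldEmit(List.of(new Cell("r1", "a", 100L)), 100L, seen)); // false: all seen
        System.out.println(shouldEmit(row, 200L, seen)); // false: the whole row is older than the cutoff
    }
}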

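Second, the state merge: when a scan ends at the same timestamp as the previous one, the two row-to-seen-cells maps are unioned so neither scan's history is lost. The same merge with plain collections (hypothetical helper, not NiFi API):

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class MergeSketch {

    // Union two row -> seen-cell-values maps; deep-copies the sets so the
    // inputs are never mutated (mirrors the deep copy in onTrigger above).
    static Map<String, Set<String>> merge(Map<String, Set<String>> latest,
                                          Map<String, Set<String>> previous) {
        final Map<String, Set<String>> combined = new HashMap<>();
        latest.forEach((row, cells) -> combined.put(row, new HashSet<>(cells)));
        previous.forEach((row, cells) ->
                combined.computeIfAbsent(row, r -> new HashSet<>()).addAll(cells));
        return combined;
    }

    public static void main(String[] args) {
        final Map<String, Set<String>> a = Map.of("r1", Set.of("x"));
        final Map<String, Set<String>> b = Map.of("r1", Set.of("y"), "r2", Set.of("z"));
        System.out.println(merge(a, b)); // {r1=[x, y], r2=[z]} (entry order may vary)
    }
}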