Use of org.apache.nifi.hbase.io.JsonRowSerializer in project nifi by apache.
The example below is the onTrigger method of the GetHBase processor class.
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
final String tableName = context.getProperty(TABLE_NAME).getValue();
final String initialTimeRange = context.getProperty(INITIAL_TIMERANGE).getValue();
final String filterExpression = context.getProperty(FILTER_EXPRESSION).getValue();
final HBaseClientService hBaseClientService = context.getProperty(HBASE_CLIENT_SERVICE).asControllerService(HBaseClientService.class);
// if the table was changed then remove any previous state
if (previousTable != null && !tableName.equals(previousTable)) {
try {
context.getStateManager().clear(Scope.CLUSTER);
} catch (final IOException ioe) {
getLogger().warn("Failed to clear Cluster State", ioe);
}
previousTable = tableName;
}
try {
final Charset charset = Charset.forName(context.getProperty(CHARSET).getValue());
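// each row pulled from HBase is serialized to a JSON document (row key plus cells) using the configured character set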
final RowSerializer serializer = new JsonRowSerializer(charset);
this.lastResult = getState(context.getStateManager());
final long defaultMinTime = (initialTimeRange.equals(NONE.getValue()) ? 0L : System.currentTimeMillis());
final long minTime = (lastResult == null ? defaultMinTime : lastResult.getTimestamp());
final Map<String, Set<String>> cellsMatchingTimestamp = new HashMap<>();
final AtomicReference<Long> rowsPulledHolder = new AtomicReference<>(0L);
final AtomicReference<Long> latestTimestampHolder = new AtomicReference<>(minTime);
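// rowsPulledHolder counts the rows emitted during this trigger so the session can be committed in batches;
// latestTimestampHolder tracks the newest cell timestamp seen so it can be persisted as the next minimum time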
hBaseClientService.scan(tableName, columns, filterExpression, minTime, new ResultHandler() {
@Override
public void handle(final byte[] rowKey, final ResultCell[] resultCells) {
final String rowKeyString = new String(rowKey, StandardCharsets.UTF_8);
// check if latest cell timestamp is equal to our cutoff.
// if any of the cells have a timestamp later than our cutoff, then we
// want the row. But if the cell with the latest timestamp is equal to
// our cutoff, then we want to check if that's one of the cells that
// we have already seen.
long latestCellTimestamp = 0L;
for (final ResultCell cell : resultCells) {
if (cell.getTimestamp() > latestCellTimestamp) {
latestCellTimestamp = cell.getTimestamp();
}
}
// the newest cell in this row is older than our cutoff, so we've already seen this row.
if (latestCellTimestamp < minTime) {
getLogger().debug("latest cell timestamp for row {} is {}, which is earlier than the minimum time of {}", new Object[] { rowKeyString, latestCellTimestamp, minTime });
return;
}
if (latestCellTimestamp == minTime) {
// latest cell timestamp is equal to our minimum time. Check if all cells that have
// that timestamp are in our list of previously seen cells.
boolean allSeen = true;
for (final ResultCell cell : resultCells) {
if (cell.getTimestamp() == latestCellTimestamp) {
if (lastResult == null || !lastResult.contains(cell)) {
allSeen = false;
break;
}
}
}
if (allSeen) {
// we have already seen all of the cells for this row. We do not want to
// include this cell in our output.
getLogger().debug("all cells for row {} have already been seen", new Object[] { rowKeyString });
return;
}
}
// if the newest cell timestamp is at least as late as the latest timestamp we've recorded, remember the
// cells carrying that timestamp so that on the next scan we can ignore these cells.
if (latestCellTimestamp >= latestTimestampHolder.get()) {
// new timestamp, so clear all of the 'matching cells'
if (latestCellTimestamp > latestTimestampHolder.get()) {
latestTimestampHolder.set(latestCellTimestamp);
cellsMatchingTimestamp.clear();
}
for (final ResultCell cell : resultCells) {
final long ts = cell.getTimestamp();
if (ts == latestCellTimestamp) {
final byte[] rowValue = Arrays.copyOfRange(cell.getRowArray(), cell.getRowOffset(), cell.getRowLength() + cell.getRowOffset());
final byte[] cellValue = Arrays.copyOfRange(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength() + cell.getValueOffset());
final String rowHash = new String(rowValue, StandardCharsets.UTF_8);
Set<String> cellHashes = cellsMatchingTimestamp.get(rowHash);
if (cellHashes == null) {
cellHashes = new HashSet<>();
cellsMatchingTimestamp.put(rowHash, cellHashes);
}
cellHashes.add(new String(cellValue, StandardCharsets.UTF_8));
}
}
}
// write the row to a new FlowFile.
FlowFile flowFile = session.create();
flowFile = session.write(flowFile, new OutputStreamCallback() {
@Override
public void process(final OutputStream out) throws IOException {
serializer.serialize(rowKey, resultCells, out);
}
});
final Map<String, String> attributes = new HashMap<>();
attributes.put("hbase.table", tableName);
attributes.put("mime.type", "application/json");
flowFile = session.putAllAttributes(flowFile, attributes);
session.getProvenanceReporter().receive(flowFile, hBaseClientService.toTransitUri(tableName, rowKeyString));
session.transfer(flowFile, REL_SUCCESS);
getLogger().debug("Received {} from HBase with row key {}", new Object[] { flowFile, rowKeyString });
// we could potentially have a huge number of rows. If we get to 500, go ahead and commit the
// session so that we can avoid buffering tons of FlowFiles without ever sending any out.
long rowsPulled = rowsPulledHolder.get();
rowsPulledHolder.set(++rowsPulled);
if (rowsPulled % getBatchSize() == 0) {
session.commit();
}
}
});
final ScanResult scanResults = new ScanResult(latestTimestampHolder.get(), cellsMatchingTimestamp);
// Commit session before we replace the lastResult; if session commit fails, we want
// to pull these records again.
session.commit();
if (lastResult == null || scanResults.getTimestamp() > lastResult.getTimestamp()) {
lastResult = scanResults;
} else if (scanResults.getTimestamp() == lastResult.getTimestamp()) {
final Map<String, Set<String>> combinedResults = new HashMap<>(scanResults.getMatchingCells());
// do a deep copy because the Set may be modified below.
for (final Map.Entry<String, Set<String>> entry : scanResults.getMatchingCells().entrySet()) {
combinedResults.put(entry.getKey(), new HashSet<>(entry.getValue()));
}
// merge in the results from 'lastResult'
for (final Map.Entry<String, Set<String>> entry : lastResult.getMatchingCells().entrySet()) {
final Set<String> existing = combinedResults.get(entry.getKey());
if (existing == null) {
combinedResults.put(entry.getKey(), new HashSet<>(entry.getValue()));
} else {
existing.addAll(entry.getValue());
}
}
final ScanResult scanResult = new ScanResult(scanResults.getTimestamp(), combinedResults);
lastResult = scanResult;
}
// save state using the framework's state manager
storeState(lastResult, context.getStateManager());
} catch (final IOException e) {
getLogger().error("Failed to receive data from HBase due to {}", e);
session.rollback();
} finally {
// if we failed, we want to yield so that we don't hammer hbase. If we succeed, then we have
// pulled all of the records, so we want to wait a bit before hitting hbase again anyway.
context.yield();
}
}
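For reference, a minimal standalone sketch of the same serializer call outside the processor. This is not part of the NiFi source: the class name, row key, and empty cell array are illustrative only, and the ResultCell import assumes the package used by the NiFi HBase client service API.
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.nifi.hbase.io.JsonRowSerializer;
import org.apache.nifi.hbase.scan.ResultCell; // package assumed, not shown in the listing above
public class JsonRowSerializerSketch {
    public static void main(final String[] args) throws IOException {
        // same constructor and serialize(byte[], ResultCell[], OutputStream) call used in onTrigger above
        final JsonRowSerializer serializer = new JsonRowSerializer(StandardCharsets.UTF_8);
        final byte[] rowKey = "row-1".getBytes(StandardCharsets.UTF_8); // illustrative row key
        final ResultCell[] cells = new ResultCell[0];                   // empty, just to show the call shape
        final ByteArrayOutputStream out = new ByteArrayOutputStream();
        serializer.serialize(rowKey, cells, out);
        // the serializer writes one JSON document describing the row to the stream
        System.out.println(new String(out.toByteArray(), StandardCharsets.UTF_8));
    }
}
In the onTrigger method above, the same call writes to the FlowFile's OutputStream inside session.write(), producing one JSON document per HBase row.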