Use of datawave.ingest.mapreduce.handler.DataTypeHandler in project datawave by NationalSecurityAgency.
The class EventMapper, method map().
public void map(K1 key, V1 value, Context context) throws IOException, InterruptedException {
    TraceStopwatch eventMapperTimer = null;
    if (metricsEnabled) {
        eventMapperTimer = new TraceStopwatch("Time in EventMapper");
        eventMapperTimer.start();
    }

    // Ensure this datatype's handlers etc. are loaded so that the dataTypeDiscardIntervalCache and validators are filled as well
    List<DataTypeHandler<K1>> typeHandlers = loadDataType(value.getDataType().typeName(), context);

    // This is a little bit fragile, but there is no other way
    // to get the context on a partitioner, and we are only
    // using this to set some counters that collect stats.
    MultiTableRangePartitioner.setContext(context);

    Long myInterval = dataTypeDiscardIntervalCache.get(value.getDataType().typeName());

    // Setup the configuration on the event.
    // This is automatically done by the sequence reader....
    // value.setConf(context.getConfiguration());

    // Flag to control whether a reprocessed event caused an NDC.push
    boolean reprocessedNDCPush = false;

    byte[] rawData = value.getRawData();
    if (rawData != null) {
        long rawDataBytes = rawData.length;
        getCounter(context, IngestInput.LINE_BYTES.toString(), "TOTAL").increment(rawDataBytes);
        long minBytes = getCounter(context, IngestInput.LINE_BYTES.toString(), "MIN").getValue();
        if (rawDataBytes < minBytes) {
            getCounter(context, IngestInput.LINE_BYTES.toString(), "MIN").setValue(rawDataBytes);
        }
        long maxBytes = getCounter(context, IngestInput.LINE_BYTES.toString(), "MAX").getValue();
        if (rawDataBytes > maxBytes) {
            getCounter(context, IngestInput.LINE_BYTES.toString(), "MAX").setValue(rawDataBytes);
        }
    }

    // First let's clear this event from the error table if we are reprocessing a previously errored event
    if (value.getAuxData() instanceof EventErrorSummary) {
        EventErrorSummary errorSummary = (EventErrorSummary) (value.getAuxData());
        value.setAuxData(null);
        // pass the processedCount through via the aux properties
        value.setAuxProperty(ErrorDataTypeHandler.PROCESSED_COUNT, Integer.toString(errorSummary.getProcessedCount() + 1));
        // delete these keys from the error table. If this fails then nothing will have changed
        if (log.isInfoEnabled())
            log.info("Purging event from the " + errorSummary.getTableName() + " table");
        try {
            // Load error dataType into typeMap
            loadDataType(TypeRegistry.ERROR_PREFIX, context);
            // purge event
            errorSummary.purge(contextWriter, context, value, typeMap);
            // Set the original file value from the event in the error table
            Collection<String> origFiles = errorSummary.getEventFields().get(SEQUENCE_FILE_FIELDNAME);
            if (!origFiles.isEmpty()) {
                NDC.push(origFiles.iterator().next());
                reprocessedNDCPush = true;
            }
        } catch (Exception e) {
            contextWriter.rollback();
            log.error("Failed to clean event from error table. Terminating map", e);
            throw new IOException("Failed to clean event from error table, Terminating map", e);
        } finally {
            contextWriter.commit(context);
            context.progress();
        }
    } else {
        // pass the processedCount through via the aux properties
        value.setAuxProperty(ErrorDataTypeHandler.PROCESSED_COUNT, "1");
    }

    // Determine whether the event date is older than the discard interval, excluding fatal error events.
    if (!value.fatalError() && null != myInterval && 0L != myInterval && (value.getDate() < (now.get() - myInterval))) {
        if (log.isInfoEnabled())
            log.info("Event with time " + value.getDate() + " older than specified interval of " + (now.get() - myInterval) + ", skipping...");
        getCounter(context, IngestInput.OLD_EVENT).increment(1);
        return;
    }

    // Combine this type's handlers with the ALL handlers
    List<DataTypeHandler<K1>> handlers = new ArrayList<>();
    handlers.addAll(typeHandlers);
    handlers.addAll(loadDataType(TypeRegistry.ALL_PREFIX, context));

    // Always include any event errors in the counters
    for (String error : value.getErrors()) {
        getCounter(context, IngestInput.EVENT_ERROR_TYPE.name(), error).increment(1);
    }

    // Switch over to the error handler list if this is still a fatal error
    if (value.fatalError()) {
        // now clear out the handlers to avoid processing this event
        handlers.clear();
        if (!value.ignorableError()) {
            // since this is not an ignorable error, let's add the error handlers back into the list
            handlers.addAll(loadDataType(TypeRegistry.ERROR_PREFIX, context));
            getCounter(context, IngestInput.EVENT_FATAL_ERROR).increment(1);
            getCounter(context, IngestInput.EVENT_FATAL_ERROR.name(), "ValidationError").increment(1);
        } else {
            getCounter(context, IngestInput.EVENT_IGNORABLE_ERROR).increment(1);
            getCounter(context, IngestInput.EVENT_IGNORABLE_ERROR.name(), "IgnorableError").increment(1);
        }
        context.progress();
    }

    Multimap<String, NormalizedContentInterface> fields = HashMultimap.create();
    try {
        processEvent(key, value, handlers, fields, context);
    } catch (Exception e) {
        // Rollback anything written for this event
        contextWriter.rollback();

        // Fail the job on constraint violations
        if (e instanceof ConstraintChecker.ConstraintViolationException) {
            throw ((RuntimeException) e);
        }

        // ensure they know we are still working on it
        context.progress();

        // log error
        log.error("Runtime exception processing event", e);

        // First set the exception on the event, unless this is a field normalization error, in which case the fields contain the errors
        if (!(e instanceof FieldNormalizationError)) {
            value.setAuxData(e);
        }

        for (DataTypeHandler<K1> handler : loadDataType(TypeRegistry.ERROR_PREFIX, context)) {
            if (log.isTraceEnabled())
                log.trace("executing handler: " + handler.getClass().getName());
            try {
                executeHandler(key, value, fields, handler, context);
                context.progress();
            } catch (Exception e2) {
                // This is a real bummer: we had a critical exception attempting to throw the event into the error table.
                // Let's terminate this job.
                log.error("Failed to process error data handlers for an event", e2);
                throw new IOException("Failed to process error data handlers for an event", e2);
            }
        }

        // now create some counters
        getCounter(context, IngestProcess.RUNTIME_EXCEPTION).increment(1);
        List<String> exceptions = getExceptionSynopsis(e);
        for (String exception : exceptions) {
            getCounter(context, IngestProcess.RUNTIME_EXCEPTION.name(), exception).increment(1);
        }
    } finally {
        // Remove ORIG_FILE from NDC that was populated by reprocessing events from the error tables
        if (reprocessedNDCPush) {
            NDC.pop();
        }
        // cleanup the context writer
        contextWriter.commit(context);
        context.progress();
    }

    getCounter(context, IngestOutput.EVENTS_PROCESSED.name(), value.getDataType().typeName().toUpperCase()).increment(1);
    offset++;

    if (metricsEnabled && eventMapperTimer != null) {
        eventMapperTimer.stop();
        long timeInEventMapper = eventMapperTimer.elapsed(TimeUnit.MILLISECONDS);
        metricsLabels.clear();
        metricsLabels.put("dataType", value.getDataType().typeName());
        metricsService.collect(Metric.MILLIS_IN_EVENT_MAPPER, metricsLabels.get(), fields, timeInEventMapper);
    }
}
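For orientation, the following is a minimal, hypothetical driver sketch showing how a mapper such as EventMapper is typically wired into a Hadoop job so that map() is invoked once per input record. Only the org.apache.hadoop API calls are standard; the job name, the use of SequenceFileInputFormat, and the omission of reducer and output settings are illustrative assumptions, not taken from the datawave source.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
// import of EventMapper omitted; it lives in the datawave ingest code base

public class IngestDriverSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "datawave-ingest-sketch");
        job.setJarByClass(IngestDriverSketch.class);
        // EventMapper.map(K1, V1, Context) is called once for every record produced by the input format
        job.setMapperClass(EventMapper.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // reducer, partitioner, output format, and output key/value classes omitted for brevity
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

In the real ingest job the input format, key/value types, and table configuration are supplied by datawave's own job setup; this sketch only shows where map() sits in the Hadoop task lifecycle.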
Use of datawave.ingest.mapreduce.handler.DataTypeHandler in project datawave by NationalSecurityAgency.
The class EventMapper, method loadDataType().
/**
 * Get the data type handlers for a given type name. This will also fill the dataTypeDiscardIntervalCache and the validators as a side effect.
 *
 * @param typeStr
 *            the data type name
 * @param context
 *            the mapper context
 * @return the data type handlers
 */
private List<DataTypeHandler<K1>> loadDataType(String typeStr, Context context) {
    // Do not load the type twice
    if (!typeMap.containsKey(typeStr)) {
        typeMap.put(typeStr, new ArrayList<>());

        long myInterval = context.getConfiguration().getLong(typeStr + "." + DISCARD_INTERVAL, interval);
        dataTypeDiscardIntervalCache.put(typeStr, myInterval);
        log.info("Setting up type: " + typeStr + " with interval " + myInterval);

        if (!TypeRegistry.getTypeNames().contains(typeStr)) {
            log.warn("Attempted to load configuration for a type that does not exist in the registry: " + typeStr);
        } else {
            Type t = TypeRegistry.getType(typeStr);
            String fieldValidators = context.getConfiguration().get(typeStr + FieldValidator.FIELD_VALIDATOR_NAMES);
            if (fieldValidators != null) {
                String[] validatorClasses = StringUtils.split(fieldValidators, ",");
                for (String validatorClass : validatorClasses) {
                    try {
                        Class<? extends FieldValidator> clazz = Class.forName(validatorClass).asSubclass(FieldValidator.class);
                        FieldValidator validator = clazz.newInstance();
                        validator.init(t, context.getConfiguration());
                        validators.put(typeStr, validator);
                    } catch (ClassNotFoundException e) {
                        log.error("Error finding validator " + validatorClass, e);
                    } catch (InstantiationException | IllegalAccessException e) {
                        log.error("Error creating validator " + validatorClass, e);
                    }
                }
            }
            String[] handlerClassNames = t.getDefaultDataTypeHandlers();
            if (handlerClassNames != null) {
                for (String handlerClassName : handlerClassNames) {
                    log.info("Configuring handler: " + handlerClassName);
                    try {
                        @SuppressWarnings("unchecked")
                        Class<? extends DataTypeHandler<K1>> clazz = (Class<? extends DataTypeHandler<K1>>) Class.forName(handlerClassName);
                        DataTypeHandler<K1> h = clazz.newInstance();
                        // Create a counter initialized to zero for all handler types.
                        getCounter(context, IngestOutput.ROWS_CREATED.name(), h.getClass().getSimpleName()).increment(0);
                        // Trick here: set the data.name parameter to this type's name, then call setup on the DataTypeHandler
                        Configuration clone = new Configuration(context.getConfiguration());
                        clone.set(DataTypeHelper.Properties.DATA_NAME, t.typeName());
                        // Use the StandaloneReporter and StandaloneTaskAttemptContext for the handlers, because the StandaloneTaskAttemptContext
                        // is a subclass of TaskInputOutputContext and TaskAttemptContext is not. We are using this to record the counters during
                        // processing. We will need to add the counters in the StandaloneReporter to the Map.Context in the close call.
                        // TaskAttemptContext newContext = new TaskAttemptContext(clone, context.getTaskAttemptID());
                        StandaloneTaskAttemptContext<K1, V1, K2, V2> newContext = new StandaloneTaskAttemptContext<>(clone, context.getTaskAttemptID(), reporter);
                        h.setup(newContext);
                        typeMap.get(typeStr).add(h);
                    } catch (ClassNotFoundException e) {
                        log.error("Error finding DataTypeHandler " + handlerClassName, e);
                    } catch (InstantiationException | IllegalAccessException e) {
                        log.error("Error creating DataTypeHandler " + handlerClassName, e);
                    }
                }
            }
        }
        log.info("EventMapper configured with the following handlers for " + typeStr + ": " + typeMap.get(typeStr));
    }
    return typeMap.get(typeStr);
}
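Both the validator and handler loading above rely on reflective instantiation from class names found in the configuration. The following is a minimal sketch of that idiom written as a standalone helper; it is illustrative only (the helper name is not part of the datawave code base) and uses getDeclaredConstructor().newInstance() in place of the deprecated Class.newInstance() seen in the original.

// Hedged sketch: generic reflective instantiation, mirroring the pattern used in loadDataType above.
private static <T> T instantiate(String className, Class<T> expectedType) {
    try {
        // Resolve the class, check it implements/extends the expected type, and construct it via the no-arg constructor
        Class<? extends T> clazz = Class.forName(className).asSubclass(expectedType);
        return clazz.getDeclaredConstructor().newInstance();
    } catch (ReflectiveOperationException e) {
        throw new IllegalArgumentException("Cannot instantiate " + className + " as " + expectedType.getName(), e);
    }
}

Unlike this sketch, loadDataType deliberately logs and skips a bad class name rather than failing, so a single misconfigured validator or handler does not abort the whole mapper.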
Use of datawave.ingest.mapreduce.handler.DataTypeHandler in project datawave by NationalSecurityAgency.
The class EventMapper, method cleanup().
@Override
public void cleanup(Context context) throws IOException, InterruptedException {
    // Write the metadata to the output
    for (List<DataTypeHandler<K1>> handlers : typeMap.values()) {
        for (DataTypeHandler<K1> h : handlers) {
            if (h.getMetadata() != null) {
                try {
                    contextWriter.write(h.getMetadata().getBulkMetadata(), context);
                } finally {
                    contextWriter.commit(context);
                }
            }
        }
    }

    // dump any unflushed metrics
    if (metricsEnabled) {
        metricsService.close();
    }

    // cleanup the context writer
    contextWriter.cleanup(context);

    for (List<DataTypeHandler<K1>> handlers : typeMap.values()) {
        for (DataTypeHandler<K1> h : handlers) {
            h.close(context);
        }
    }
    typeMap.clear();

    // Add the counters from the standalone reporter to this context.
    Counters counters = reporter.getCounters();
    for (CounterGroup cg : counters) {
        for (Counter c : cg) {
            getCounter(context, cg.getName(), c.getName()).increment(c.getValue());
        }
    }

    super.cleanup(context);

    // we pushed the filename on the NDC if split is non null, so pop it here.
    if (null != split) {
        NDC.pop();
    }
}
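Both map() and cleanup() use log4j's NDC to tag log messages with the originating file, and each is careful to pop exactly what it pushed. A minimal sketch of that push/pop discipline, assuming the log4j 1.x org.apache.log4j.NDC API used above (the helper itself is illustrative, not part of the class):

// Hedged sketch: the NDC discipline used in this class. Pushing the source file name makes it
// appear in subsequent log messages for the record; popping in a finally block guarantees the
// tag is removed even if processing fails.
void runWithLoggingContext(String origFile, Runnable work) {
    NDC.push(origFile);
    try {
        work.run();
    } finally {
        NDC.pop();
    }
}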