Example 1 with DataTypeHandler

Use of datawave.ingest.mapreduce.handler.DataTypeHandler in project datawave by NationalSecurityAgency.

From the class EventMapper, the map method:

public void map(K1 key, V1 value, Context context) throws IOException, InterruptedException {
    TraceStopwatch eventMapperTimer = null;
    if (metricsEnabled) {
        eventMapperTimer = new TraceStopwatch("Time in EventMapper");
        eventMapperTimer.start();
    }
    // ensure this datatype's handlers etc are loaded such that the dataTypeDiscardIntervalCache and validators are filled as well
    List<DataTypeHandler<K1>> typeHandlers = loadDataType(value.getDataType().typeName(), context);
    // This is a little bit fragile, but there is no other way
    // to get the context on a partitioner, and we are only
    // using this to set some counters that collect stats.
    MultiTableRangePartitioner.setContext(context);
    Long myInterval = dataTypeDiscardIntervalCache.get(value.getDataType().typeName());
    // setup the configuration on the event
    // this is automatically done by the sequence reader....
    // value.setConf(context.getConfiguration());
    // Flag to control whether a reprocessed event caused an NDC.push
    boolean reprocessedNDCPush = false;
    byte[] rawData = value.getRawData();
    if (rawData != null) {
        long rawDataBytes = rawData.length;
        getCounter(context, IngestInput.LINE_BYTES.toString(), "TOTAL").increment(rawDataBytes);
        long minBytes = getCounter(context, IngestInput.LINE_BYTES.toString(), "MIN").getValue();
        if (rawDataBytes < minBytes) {
            getCounter(context, IngestInput.LINE_BYTES.toString(), "MIN").setValue(rawDataBytes);
        }
        long maxBytes = getCounter(context, IngestInput.LINE_BYTES.toString(), "MAX").getValue();
        if (rawDataBytes > maxBytes) {
            getCounter(context, IngestInput.LINE_BYTES.toString(), "MAX").setValue(rawDataBytes);
        }
    }
    // First, let's clear this event from the error table if we are reprocessing a previously errored event
    if (value.getAuxData() instanceof EventErrorSummary) {
        EventErrorSummary errorSummary = (EventErrorSummary) (value.getAuxData());
        value.setAuxData(null);
        // pass the processedCount through via the aux properties
        value.setAuxProperty(ErrorDataTypeHandler.PROCESSED_COUNT, Integer.toString(errorSummary.getProcessedCount() + 1));
        // delete these keys from the error table. If this fails then nothing will have changed
        if (log.isInfoEnabled())
            log.info("Purging event from the " + errorSummary.getTableName() + " table");
        try {
            // Load error dataType into typeMap
            loadDataType(TypeRegistry.ERROR_PREFIX, context);
            // purge event
            errorSummary.purge(contextWriter, context, value, typeMap);
            // Set the original file value from the event in the error table
            Collection<String> origFiles = errorSummary.getEventFields().get(SEQUENCE_FILE_FIELDNAME);
            if (!origFiles.isEmpty()) {
                NDC.push(origFiles.iterator().next());
                reprocessedNDCPush = true;
            }
        } catch (Exception e) {
            contextWriter.rollback();
            log.error("Failed to clean event from error table.  Terminating map", e);
            throw new IOException("Failed to clean event from error table, Terminating map", e);
        } finally {
            contextWriter.commit(context);
            context.progress();
        }
    } else {
        // pass the processedCount through via the aux properties
        value.setAuxProperty(ErrorDataTypeHandler.PROCESSED_COUNT, "1");
    }
    // Skip the event if its date is older than the configured discard interval, excluding fatal error events
    if (!value.fatalError() && null != myInterval && 0L != myInterval && (value.getDate() < (now.get() - myInterval))) {
        if (log.isInfoEnabled())
            log.info("Event with time " + value.getDate() + " older than specified interval of " + (now.get() - myInterval) + ", skipping...");
        getCounter(context, IngestInput.OLD_EVENT).increment(1);
        return;
    }
    // Combine this type's handlers with the handlers registered for all types
    List<DataTypeHandler<K1>> handlers = new ArrayList<>();
    handlers.addAll(typeHandlers);
    handlers.addAll(loadDataType(TypeRegistry.ALL_PREFIX, context));
    // Always include any event errors in the counters
    for (String error : value.getErrors()) {
        getCounter(context, IngestInput.EVENT_ERROR_TYPE.name(), error).increment(1);
    }
    // switch over to the errorHandlerList if still a fatal error
    if (value.fatalError()) {
        // now clear out the handlers to avoid processing this event
        handlers.clear();
        if (!value.ignorableError()) {
            // since this is not an ignorable error, let's add the error handlers back into the list
            handlers.addAll(loadDataType(TypeRegistry.ERROR_PREFIX, context));
            getCounter(context, IngestInput.EVENT_FATAL_ERROR).increment(1);
            getCounter(context, IngestInput.EVENT_FATAL_ERROR.name(), "ValidationError").increment(1);
        } else {
            getCounter(context, IngestInput.EVENT_IGNORABLE_ERROR).increment(1);
            getCounter(context, IngestInput.EVENT_IGNORABLE_ERROR.name(), "IgnorableError").increment(1);
        }
        context.progress();
    }
    Multimap<String, NormalizedContentInterface> fields = HashMultimap.create();
    try {
        processEvent(key, value, handlers, fields, context);
    } catch (Exception e) {
        // Rollback anything written for this event
        contextWriter.rollback();
        // Fail job on constraint violations
        if (e instanceof ConstraintChecker.ConstraintViolationException) {
            throw ((RuntimeException) e);
        }
        // ensure they know we are still working on it
        context.progress();
        // log error
        log.error("Runtime exception processing event", e);
        // First, set the exception on the event unless this is a field normalization error, in which case the fields already contain the errors
        if (!(e instanceof FieldNormalizationError)) {
            value.setAuxData(e);
        }
        for (DataTypeHandler<K1> handler : loadDataType(TypeRegistry.ERROR_PREFIX, context)) {
            if (log.isTraceEnabled())
                log.trace("executing handler: " + handler.getClass().getName());
            try {
                executeHandler(key, value, fields, handler, context);
                context.progress();
            } catch (Exception e2) {
                // This is a real bummer: we had a critical exception attempting to write the event to the error table,
                // so let's terminate this job
                log.error("Failed to process error data handlers for an event", e2);
                throw new IOException("Failed to process error data handlers for an event", e2);
            }
        }
        // now create some counters
        getCounter(context, IngestProcess.RUNTIME_EXCEPTION).increment(1);
        List<String> exceptions = getExceptionSynopsis(e);
        for (String exception : exceptions) {
            getCounter(context, IngestProcess.RUNTIME_EXCEPTION.name(), exception).increment(1);
        }
    } finally {
        // Remove ORIG_FILE from NDC that was populated by reprocessing events from the error tables
        if (reprocessedNDCPush) {
            NDC.pop();
        }
        // commit any pending writes via the context writer
        contextWriter.commit(context);
        context.progress();
    }
    getCounter(context, IngestOutput.EVENTS_PROCESSED.name(), value.getDataType().typeName().toUpperCase()).increment(1);
    offset++;
    if (metricsEnabled && eventMapperTimer != null) {
        eventMapperTimer.stop();
        long timeInEventMapper = eventMapperTimer.elapsed(TimeUnit.MILLISECONDS);
        metricsLabels.clear();
        metricsLabels.put("dataType", value.getDataType().typeName());
        metricsService.collect(Metric.MILLIS_IN_EVENT_MAPPER, metricsLabels.get(), fields, timeInEventMapper);
    }
}
Also used: ArrayList (java.util.ArrayList), List (java.util.List), IOException (java.io.IOException), ErrorDataTypeHandler (datawave.ingest.mapreduce.handler.error.ErrorDataTypeHandler), ExtendedDataTypeHandler (datawave.ingest.mapreduce.handler.ExtendedDataTypeHandler), DataTypeHandler (datawave.ingest.mapreduce.handler.DataTypeHandler), TraceStopwatch (datawave.util.time.TraceStopwatch), EventErrorSummary (datawave.ingest.input.reader.event.EventErrorSummary), NormalizedContentInterface (datawave.ingest.data.config.NormalizedContentInterface), ConstraintChecker (datawave.ingest.mapreduce.job.ConstraintChecker)
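
For reference, the TOTAL/MIN/MAX byte tracking near the top of map() is the common pattern of using Hadoop counters as running aggregates. The sketch below isolates that pattern with the plain org.apache.hadoop.mapreduce counter API; the ByteStatsCounters class and its counter group name are hypothetical illustrations, not part of DataWave, and it uses the task context directly rather than the getCounter helper shown above.

import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;

// Minimal sketch (not the DataWave helper): track TOTAL/MIN/MAX byte counts
// with Hadoop counters, mirroring the LINE_BYTES logic in map().
final class ByteStatsCounters {

    // Hypothetical counter group name, for illustration only.
    private static final String GROUP = "example.LINE_BYTES";

    static void record(TaskInputOutputContext<?, ?, ?, ?> context, long rawDataBytes) {
        context.getCounter(GROUP, "TOTAL").increment(rawDataBytes);

        // Note: a freshly created Hadoop counter starts at 0, so a real
        // implementation would need to seed MIN from the first observation.
        Counter min = context.getCounter(GROUP, "MIN");
        if (rawDataBytes < min.getValue()) {
            min.setValue(rawDataBytes);
        }

        Counter max = context.getCounter(GROUP, "MAX");
        if (rawDataBytes > max.getValue()) {
            max.setValue(rawDataBytes);
        }
    }

    private ByteStatsCounters() {
    }
}

Counters updated this way appear in the job's counter output alongside the ones incremented directly on the context, which is why map() can use them both for totals and for simple distribution statistics.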

Example 2 with DataTypeHandler

Use of datawave.ingest.mapreduce.handler.DataTypeHandler in project datawave by NationalSecurityAgency.

From the class EventMapper, the loadDataType method:

/**
 * Get the data type handlers for a given type name. This will also fill the dataTypeDiscardIntervalCache and the validators as a side effect.
 *
 * @param typeStr
 *            the data type name to load
 * @param context
 *            the mapper context supplying the configuration
 * @return the data type handlers
 */
private List<DataTypeHandler<K1>> loadDataType(String typeStr, Context context) {
    // Do not load the type twice
    if (!typeMap.containsKey(typeStr)) {
        typeMap.put(typeStr, new ArrayList<>());
        long myInterval = context.getConfiguration().getLong(typeStr + "." + DISCARD_INTERVAL, interval);
        dataTypeDiscardIntervalCache.put(typeStr, myInterval);
        log.info("Setting up type: " + typeStr + " with interval " + myInterval);
        if (!TypeRegistry.getTypeNames().contains(typeStr)) {
            log.warn("Attempted to load configuration for a type that does not exist in the registry: " + typeStr);
        } else {
            Type t = TypeRegistry.getType(typeStr);
            String fieldValidators = context.getConfiguration().get(typeStr + FieldValidator.FIELD_VALIDATOR_NAMES);
            if (fieldValidators != null) {
                String[] validatorClasses = StringUtils.split(fieldValidators, ",");
                for (String validatorClass : validatorClasses) {
                    try {
                        Class<? extends FieldValidator> clazz = Class.forName(validatorClass).asSubclass(FieldValidator.class);
                        FieldValidator validator = clazz.newInstance();
                        validator.init(t, context.getConfiguration());
                        validators.put(typeStr, validator);
                    } catch (ClassNotFoundException e) {
                        log.error("Error finding validator " + validatorClass, e);
                    } catch (InstantiationException | IllegalAccessException e) {
                        log.error("Error creating validator " + validatorClass, e);
                    }
                }
            }
            String[] handlerClassNames = t.getDefaultDataTypeHandlers();
            if (handlerClassNames != null) {
                for (String handlerClassName : handlerClassNames) {
                    log.info("Configuring handler: " + handlerClassName);
                    try {
                        @SuppressWarnings("unchecked") Class<? extends DataTypeHandler<K1>> clazz = (Class<? extends DataTypeHandler<K1>>) Class.forName(handlerClassName);
                        DataTypeHandler<K1> h = clazz.newInstance();
                        // Create a counter initialized to zero for all handler types.
                        getCounter(context, IngestOutput.ROWS_CREATED.name(), h.getClass().getSimpleName()).increment(0);
                        // Trick here: set the data.name parameter to this type's name, then call setup on the DataTypeHandler
                        Configuration clone = new Configuration(context.getConfiguration());
                        clone.set(DataTypeHelper.Properties.DATA_NAME, t.typeName());
                        // Use the StandaloneReporter and StandaloneTaskAttemptContext for the handlers, because StandaloneTaskAttemptContext
                        // is a subclass of TaskInputOutputContext while TaskAttemptContext is not. We use this to record the counters during
                        // processing, and the counters in the StandaloneReporter are added to the mapper's Context in cleanup.
                        // TaskAttemptContext newContext = new TaskAttemptContext(clone, context.getTaskAttemptID());
                        StandaloneTaskAttemptContext<K1, V1, K2, V2> newContext = new StandaloneTaskAttemptContext<>(clone, context.getTaskAttemptID(), reporter);
                        h.setup(newContext);
                        typeMap.get(typeStr).add(h);
                    } catch (ClassNotFoundException e) {
                        log.error("Error finding DataTypeHandler " + handlerClassName, e);
                    } catch (InstantiationException | IllegalAccessException e) {
                        log.error("Error creating DataTypeHandler " + handlerClassName, e);
                    }
                }
            }
        }
        log.info("EventMapper configured with the following handlers for " + typeStr + ": " + typeMap.get(typeStr));
    }
    return typeMap.get(typeStr);
}
Also used: StandaloneTaskAttemptContext (datawave.ingest.test.StandaloneTaskAttemptContext), Configuration (org.apache.hadoop.conf.Configuration), MetricsConfiguration (datawave.ingest.mapreduce.job.metrics.MetricsConfiguration), FieldValidator (datawave.ingest.validation.FieldValidator), ErrorDataTypeHandler (datawave.ingest.mapreduce.handler.error.ErrorDataTypeHandler), ExtendedDataTypeHandler (datawave.ingest.mapreduce.handler.ExtendedDataTypeHandler), DataTypeHandler (datawave.ingest.mapreduce.handler.DataTypeHandler), Type (datawave.ingest.data.Type)
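
The body of loadDataType combines two ideas: per-type configuration keys that override a global default (the discard interval lookup) and reflective instantiation of handler classes named in the configuration. The sketch below reproduces both ideas in isolation using only org.apache.hadoop.conf.Configuration and standard reflection; the Handler interface and the property names (example.discard.interval, example.handler.classes, example.data.name) are hypothetical stand-ins, not DataWave's actual keys or types.

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;

// Minimal sketch of per-type configuration overrides and reflective handler
// loading, modeled on loadDataType() but with hypothetical keys and types.
final class PerTypeConfigExample {

    /** Hypothetical handler contract used only for this illustration. */
    interface Handler {
        void setup(Configuration conf);
    }

    static long discardInterval(Configuration conf, String typeName, long globalDefault) {
        // A type-specific key (e.g. "mytype.example.discard.interval") wins,
        // otherwise fall back to the global default.
        return conf.getLong(typeName + ".example.discard.interval", globalDefault);
    }

    static List<Handler> loadHandlers(Configuration conf, String typeName) {
        List<Handler> handlers = new ArrayList<>();
        String[] classNames = conf.getStrings(typeName + ".example.handler.classes");
        if (classNames == null) {
            return handlers;
        }
        for (String className : classNames) {
            try {
                Handler h = Class.forName(className)
                                 .asSubclass(Handler.class)
                                 .getDeclaredConstructor()
                                 .newInstance();
                // Hand the handler a per-type copy of the configuration, as
                // loadDataType() does with its cloned Configuration.
                Configuration clone = new Configuration(conf);
                clone.set("example.data.name", typeName);
                h.setup(clone);
                handlers.add(h);
            } catch (ReflectiveOperationException e) {
                // Mirror loadDataType(): log and skip rather than fail the job.
                System.err.println("Could not load handler " + className + ": " + e);
            }
        }
        return handlers;
    }

    private PerTypeConfigExample() {
    }
}

Cloning the Configuration before setting the per-type name, as loadDataType does, keeps one handler's setup from leaking type-specific settings into the shared job configuration.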

Example 3 with DataTypeHandler

Use of datawave.ingest.mapreduce.handler.DataTypeHandler in project datawave by NationalSecurityAgency.

From the class EventMapper, the cleanup method:

@Override
public void cleanup(Context context) throws IOException, InterruptedException {
    // Write the metadata to the output
    for (List<DataTypeHandler<K1>> handlers : typeMap.values()) {
        for (DataTypeHandler<K1> h : handlers) {
            if (h.getMetadata() != null) {
                try {
                    contextWriter.write(h.getMetadata().getBulkMetadata(), context);
                } finally {
                    contextWriter.commit(context);
                }
            }
        }
    }
    // dump any unflushed metrics
    if (metricsEnabled) {
        metricsService.close();
    }
    // cleanup the context writer
    contextWriter.cleanup(context);
    for (List<DataTypeHandler<K1>> handlers : typeMap.values()) {
        for (DataTypeHandler<K1> h : handlers) {
            h.close(context);
        }
    }
    typeMap.clear();
    // Add the counters from the standalone reporter to this context.
    Counters counters = reporter.getCounters();
    for (CounterGroup cg : counters) {
        for (Counter c : cg) {
            getCounter(context, cg.getName(), c.getName()).increment(c.getValue());
        }
    }
    super.cleanup(context);
    // We pushed the file name onto the NDC if split is non-null, so pop it here.
    if (null != split) {
        NDC.pop();
    }
}
Also used: Counter (org.apache.hadoop.mapreduce.Counter), CounterGroup (org.apache.hadoop.mapreduce.CounterGroup), Counters (org.apache.hadoop.mapreduce.Counters), ErrorDataTypeHandler (datawave.ingest.mapreduce.handler.error.ErrorDataTypeHandler), ExtendedDataTypeHandler (datawave.ingest.mapreduce.handler.ExtendedDataTypeHandler), DataTypeHandler (datawave.ingest.mapreduce.handler.DataTypeHandler)
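
The final loop in cleanup() merges every counter accumulated by the standalone reporter back into the live task context. That merge can be expressed on its own with the standard Hadoop counter types, as in the hedged sketch below; the CounterMerge class is illustrative, while the Counters/CounterGroup/Counter iteration and getCounter(group, name).increment(...) calls are the same Hadoop APIs used above.

import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.CounterGroup;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;

// Illustrative helper: merge a detached Counters snapshot (e.g. one built up
// by a standalone reporter) into the counters of a live task context.
final class CounterMerge {

    static void mergeInto(Counters source, TaskInputOutputContext<?, ?, ?, ?> context) {
        // Counters is iterable over its groups, and each group over its counters.
        for (CounterGroup group : source) {
            for (Counter counter : group) {
                context.getCounter(group.getName(), counter.getName())
                       .increment(counter.getValue());
            }
        }
    }

    private CounterMerge() {
    }
}

Using increment rather than setValue preserves anything already recorded on the context's counters under the same group and name.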

Aggregations

DataTypeHandler (datawave.ingest.mapreduce.handler.DataTypeHandler): 3
ExtendedDataTypeHandler (datawave.ingest.mapreduce.handler.ExtendedDataTypeHandler): 3
ErrorDataTypeHandler (datawave.ingest.mapreduce.handler.error.ErrorDataTypeHandler): 3
Type (datawave.ingest.data.Type): 1
NormalizedContentInterface (datawave.ingest.data.config.NormalizedContentInterface): 1
EventErrorSummary (datawave.ingest.input.reader.event.EventErrorSummary): 1
ConstraintChecker (datawave.ingest.mapreduce.job.ConstraintChecker): 1
MetricsConfiguration (datawave.ingest.mapreduce.job.metrics.MetricsConfiguration): 1
StandaloneTaskAttemptContext (datawave.ingest.test.StandaloneTaskAttemptContext): 1
FieldValidator (datawave.ingest.validation.FieldValidator): 1
TraceStopwatch (datawave.util.time.TraceStopwatch): 1
IOException (java.io.IOException): 1
ArrayList (java.util.ArrayList): 1
List (java.util.List): 1
Configuration (org.apache.hadoop.conf.Configuration): 1
Counter (org.apache.hadoop.mapreduce.Counter): 1
CounterGroup (org.apache.hadoop.mapreduce.CounterGroup): 1
Counters (org.apache.hadoop.mapreduce.Counters): 1