
Example 1 with CompositeIngest

Use of datawave.ingest.data.config.ingest.CompositeIngest in project datawave by NationalSecurityAgency.

From class ColumnBasedHandlerTestUtil, method processEvent:

public static void processEvent(DataTypeHandler<Text> handler, ExtendedDataTypeHandler<Text, BulkIngestKey, Value> edgeHandler, RawRecordContainer event, int expectedShardKeys, int expectedShardIndexKeys, int expectedShardReverseIndexKeys, int expectedEdgeKeys, boolean printKeysOnlyOnFail) {
    Assert.assertNotNull("Event was null.", event);
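    // Extract the raw event fields, then merge in any virtual and composite fields produced by the helper.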
    Multimap<String, NormalizedContentInterface> eventFields = handler.getHelper(event.getDataType()).getEventFields(event);
    VirtualIngest vHelper = (VirtualIngest) handler.getHelper(event.getDataType());
    Multimap<String, NormalizedContentInterface> virtualFields = vHelper.getVirtualFields(eventFields);
    for (Map.Entry<String, NormalizedContentInterface> v : virtualFields.entries()) {
        eventFields.put(v.getKey(), v.getValue());
    }
    if (vHelper instanceof CompositeIngest) {
        CompositeIngest compIngest = (CompositeIngest) vHelper;
        Multimap<String, NormalizedContentInterface> compositeFields = compIngest.getCompositeFields(eventFields);
        for (String fieldName : compositeFields.keySet()) {
            // if this is an overloaded event field, we are replacing the existing data
            if (compIngest.isOverloadedCompositeField(fieldName))
                eventFields.removeAll(fieldName);
            eventFields.putAll(fieldName, compositeFields.get(fieldName));
        }
    }
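    // Run the handler over the merged fields and tally the resulting keys per destination table.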
    Multimap<BulkIngestKey, Value> results = handler.processBulk(new Text(), event, eventFields, new MockStatusReporter());
    Set<Key> shardKeys = new HashSet<>();
    Set<Key> shardIndexKeys = new HashSet<>();
    Set<Key> shardReverseIndexKeys = new HashSet<>();
    Set<Key> edgeKeys = new HashSet<>();
    Map<Text, Integer> countMap = Maps.newHashMap();
    for (BulkIngestKey k : results.keySet()) {
        Text tableName = k.getTableName();
        if (countMap.containsKey(tableName)) {
            countMap.put(tableName, countMap.get(tableName) + 1);
        } else {
            countMap.put(tableName, 1);
        }
    }
    for (Map.Entry<BulkIngestKey, Value> e : results.entries()) {
        BulkIngestKey bik = e.getKey();
        if (log.isDebugEnabled() && isDocumentKey(bik.getKey())) {
            log.debug("Found Document Key: " + bik.getKey());
            log.debug("value:\n" + e.getValue());
        }
        if (bik.getTableName().equals(shardTableName)) {
            shardKeys.add(bik.getKey());
        } else if (bik.getTableName().equals(shardIndexTableName)) {
            shardIndexKeys.add(bik.getKey());
        } else if (bik.getTableName().equals(shardReverseIndexTableName)) {
            shardReverseIndexKeys.add(bik.getKey());
        } else {
            Assert.fail("unknown table: " + bik.getTableName() + " key: " + bik.getKey());
        }
    }
    // Process edges
    countMap.put(edgeTableName, 0);
    if (null != edgeHandler) {
        MyCachingContextWriter contextWriter = new MyCachingContextWriter();
        StandaloneTaskAttemptContext<Text, RawRecordContainerImpl, BulkIngestKey, Value> ctx = new StandaloneTaskAttemptContext<>(((RawRecordContainerImpl) event).getConf(), new StandaloneStatusReporter());
        try {
            contextWriter.setup(ctx.getConfiguration(), false);
            edgeHandler.process(null, event, eventFields, ctx, contextWriter);
            contextWriter.commit(ctx);
            for (Map.Entry<BulkIngestKey, Value> entry : contextWriter.getCache().entries()) {
                if (entry.getKey().getTableName().equals(edgeTableName)) {
                    edgeKeys.add(entry.getKey().getKey());
                }
                if (countMap.containsKey(entry.getKey().getTableName())) {
                    countMap.put(entry.getKey().getTableName(), countMap.get(entry.getKey().getTableName()) + 1);
                } else {
                    countMap.put(entry.getKey().getTableName(), 1);
                }
            }
        } catch (Throwable t) {
            log.error("Error during edge processing", t);
            throw new RuntimeException(t);
        }
    }
    Set<String> keyPrint = new TreeSet<>();
    for (Key k : shardKeys) {
        keyPrint.add("shard key: " + k.getRow() + " ::: " + k.getColumnFamily().toString().replaceAll(NB, "%00;") + " ::: " + k.getColumnQualifier().toString().replaceAll(NB, "%00;") + " ::: " + k.getColumnVisibility() + " ::: " + k.getTimestamp() + "\n");
    }
    // check index keys
    for (Key k : shardIndexKeys) {
        keyPrint.add("shardIndex key: " + k.getRow() + " ::: " + k.getColumnFamily().toString().replaceAll(NB, "%00;") + " ::: " + k.getColumnQualifier().toString().replaceAll(NB, "%00;") + " ::: " + k.getColumnVisibility() + " ::: " + k.getTimestamp() + "\n");
    }
    // check reverse index keys
    for (Key k : shardReverseIndexKeys) {
        keyPrint.add("reverseShardIndex key: " + k.getRow() + " ::: " + k.getColumnFamily().toString().replaceAll(NB, "%00;") + " ::: " + k.getColumnQualifier().toString().replaceAll(NB, "%00;") + " ::: " + k.getColumnVisibility() + " ::: " + k.getTimestamp() + "\n");
    }
    // check edge keys
    for (Key k : edgeKeys) {
        keyPrint.add("edge key: " + k.getRow().toString().replaceAll(NB, "%00;") + " ::: " + k.getColumnFamily().toString().replaceAll(NB, "%00;") + " ::: " + k.getColumnQualifier().toString().replaceAll(NB, "%00;") + " ::: " + k.getColumnVisibility() + " ::: " + k.getTimestamp() + "\n");
    }
    try {
        if (!printKeysOnlyOnFail) {
            for (String keyString : keyPrint) {
                log.info(keyString.trim());
            }
        }
        if (expectedShardKeys > 0)
            Assert.assertEquals((int) countMap.get(shardTableName), expectedShardKeys);
        if (expectedShardIndexKeys > 0)
            Assert.assertEquals((int) countMap.get(shardIndexTableName), expectedShardIndexKeys);
        if (expectedShardReverseIndexKeys > 0)
            Assert.assertEquals((int) countMap.get(shardReverseIndexTableName), expectedShardReverseIndexKeys);
        if (expectedEdgeKeys > 0)
            Assert.assertEquals((int) countMap.get(edgeTableName), expectedEdgeKeys);
    } catch (AssertionError ae) {
        if (printKeysOnlyOnFail) {
            for (String keyString : keyPrint) {
                log.info(keyString.trim());
            }
        }
        Assert.fail(String.format("Expected: %s shard, %s index, %s reverse index, and %s edge keys.\nFound: %s, %s, %s, and %s respectively", expectedShardKeys, expectedShardIndexKeys, expectedShardReverseIndexKeys, expectedEdgeKeys, countMap.get(shardTableName), countMap.get(shardIndexTableName), countMap.get(shardReverseIndexTableName), countMap.get(edgeTableName)));
    }
}
Also used: StandaloneTaskAttemptContext(datawave.ingest.test.StandaloneTaskAttemptContext) RawRecordContainerImpl(datawave.ingest.config.RawRecordContainerImpl) TreeSet(java.util.TreeSet) NormalizedContentInterface(datawave.ingest.data.config.NormalizedContentInterface) HashSet(java.util.HashSet) VirtualIngest(datawave.ingest.data.config.ingest.VirtualIngest) Text(org.apache.hadoop.io.Text) CompositeIngest(datawave.ingest.data.config.ingest.CompositeIngest) Value(org.apache.accumulo.core.data.Value) BulkIngestKey(datawave.ingest.mapreduce.job.BulkIngestKey) StandaloneStatusReporter(datawave.ingest.test.StandaloneStatusReporter) Map(java.util.Map) Key(org.apache.accumulo.core.data.Key)
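
The virtual-field and composite-field merge at the top of processEvent is the same pattern the other examples below repeat. A minimal standalone sketch of that pattern, assuming only the VirtualIngest and CompositeIngest interfaces shown here; the DerivedFieldUtil class and mergeDerivedFields name are hypothetical, introduced purely for illustration:

import java.util.Map;

import com.google.common.collect.Multimap;

import datawave.ingest.data.config.NormalizedContentInterface;
import datawave.ingest.data.config.ingest.CompositeIngest;
import datawave.ingest.data.config.ingest.VirtualIngest;

public class DerivedFieldUtil {

    // Hypothetical helper: folds virtual and composite fields into the event field multimap,
    // mirroring the merge logic used by processEvent above.
    public static void mergeDerivedFields(Object helper, Multimap<String, NormalizedContentInterface> eventFields) {
        if (helper instanceof VirtualIngest) {
            Multimap<String, NormalizedContentInterface> virtualFields = ((VirtualIngest) helper).getVirtualFields(eventFields);
            for (Map.Entry<String, NormalizedContentInterface> v : virtualFields.entries()) {
                // virtual fields are simply added alongside the original event fields
                eventFields.put(v.getKey(), v.getValue());
            }
        }
        if (helper instanceof CompositeIngest) {
            CompositeIngest compIngest = (CompositeIngest) helper;
            Multimap<String, NormalizedContentInterface> compositeFields = compIngest.getCompositeFields(eventFields);
            for (String fieldName : compositeFields.keySet()) {
                // overloaded composite fields replace the existing event field values
                if (compIngest.isOverloadedCompositeField(fieldName)) {
                    eventFields.removeAll(fieldName);
                }
                eventFields.putAll(fieldName, compositeFields.get(fieldName));
            }
        }
    }
}

With a helper like this, the merge block above would reduce to a single call such as mergeDerivedFields(handler.getHelper(event.getDataType()), eventFields).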

Example 2 with CompositeIngest

Use of datawave.ingest.data.config.ingest.CompositeIngest in project datawave by NationalSecurityAgency.

From class CompositeIndexTest, method setupClass:

@BeforeClass
public static void setupClass() throws Exception {
    System.setProperty("subject.dn.pattern", "(?:^|,)\\s*OU\\s*=\\s*My Department\\s*(?:,|$)");
    setupConfiguration(conf);
    AbstractColumnBasedHandler<Text> dataTypeHandler = new AbstractColumnBasedHandler<>();
    dataTypeHandler.setup(new TaskAttemptContextImpl(conf, new TaskAttemptID()));
    TestIngestHelper ingestHelper = new TestIngestHelper();
    ingestHelper.setup(conf);
    // create and process events with WKT data
    RawRecordContainer record = new RawRecordContainerImpl();
    Multimap<BulkIngestKey, Value> keyValues = HashMultimap.create();
    int recNum = 1;
    for (int dataIdx = 0; dataIdx < 2; dataIdx++) {
        String beginDate;
        String[] wktData;
        Integer[] wktByteLengthData;
        long[] dates;
        boolean useCompositeIngest;
        if (dataIdx == 0) {
            beginDate = LEGACY_BEGIN_DATE;
            wktData = wktLegacyData;
            wktByteLengthData = wktByteLengthLegacyData;
            dates = legacyDates;
            useCompositeIngest = false;
        } else {
            beginDate = COMPOSITE_BEGIN_DATE;
            wktData = wktCompositeData;
            wktByteLengthData = wktByteLengthCompositeData;
            dates = compositeDates;
            useCompositeIngest = true;
        }
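        // Build and ingest one record per WKT entry, applying composite fields only for the composite data set.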
        for (int i = 0; i < wktData.length; i++) {
            record.clear();
            record.setDataType(new Type(DATA_TYPE_NAME, TestIngestHelper.class, (Class) null, (String[]) null, 1, (String[]) null));
            record.setRawFileName("geodata_" + recNum + ".dat");
            record.setRawRecordNumber(recNum++);
            record.setDate(formatter.parse(beginDate).getTime() + dates[i]);
            record.setRawData((wktData[i] + "|" + ((wktByteLengthData[i] != null) ? Integer.toString(wktByteLengthData[i]) : "")).getBytes("UTF8"));
            record.generateId(null);
            record.setVisibility(new ColumnVisibility(AUTHS));
            final Multimap<String, NormalizedContentInterface> fields = ingestHelper.getEventFields(record);
            if (useCompositeIngest && ingestHelper instanceof CompositeIngest) {
                Multimap<String, NormalizedContentInterface> compositeFields = ingestHelper.getCompositeFields(fields);
                for (String fieldName : compositeFields.keySet()) {
                    // if this is an overloaded event field, we are replacing the existing data
                    if (ingestHelper.isOverloadedCompositeField(fieldName))
                        fields.removeAll(fieldName);
                    fields.putAll(fieldName, compositeFields.get(fieldName));
                }
            }
            Multimap kvPairs = dataTypeHandler.processBulk(new Text(), record, fields, new MockStatusReporter());
            keyValues.putAll(kvPairs);
            dataTypeHandler.getMetadata().addEvent(ingestHelper, record, fields);
        }
    }
    keyValues.putAll(dataTypeHandler.getMetadata().getBulkMetadata());
    // Write the composite transition date manually
    Key tdKey = new Key(new Text(GEO_FIELD), new Text(ColumnFamilyConstants.COLF_CITD), new Text(DATA_TYPE_NAME + "\0" + COMPOSITE_BEGIN_DATE), new Text(), new SimpleDateFormat(CompositeMetadataHelper.transitionDateFormat).parse(COMPOSITE_BEGIN_DATE).getTime());
    keyValues.put(new BulkIngestKey(new Text(TableName.METADATA), tdKey), new Value());
    // write these values to their respective tables
    instance = new InMemoryInstance();
    Connector connector = instance.getConnector("root", PASSWORD);
    connector.securityOperations().changeUserAuthorizations("root", new Authorizations(AUTHS));
    writeKeyValues(connector, keyValues);
    ivaratorCacheDirConfigs = Collections.singletonList(new IvaratorCacheDirConfig(temporaryFolder.newFolder().toURI().toString()));
}
Also used: Connector(org.apache.accumulo.core.client.Connector) IvaratorCacheDirConfig(datawave.query.iterator.ivarator.IvaratorCacheDirConfig) TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) InMemoryInstance(datawave.accumulo.inmemory.InMemoryInstance) RawRecordContainerImpl(datawave.ingest.config.RawRecordContainerImpl) RawRecordContainer(datawave.ingest.data.RawRecordContainer) TaskAttemptContextImpl(org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl) NormalizedContentInterface(datawave.ingest.data.config.NormalizedContentInterface) ColumnVisibility(org.apache.accumulo.core.security.ColumnVisibility) AbstractColumnBasedHandler(datawave.ingest.mapreduce.handler.shard.AbstractColumnBasedHandler) Authorizations(org.apache.accumulo.core.security.Authorizations) Text(org.apache.hadoop.io.Text) CompositeIngest(datawave.ingest.data.config.ingest.CompositeIngest) MockStatusReporter(datawave.query.testframework.MockStatusReporter) HashMultimap(com.google.common.collect.HashMultimap) Multimap(com.google.common.collect.Multimap) Type(datawave.ingest.data.Type) GeometryType(datawave.data.type.GeometryType) NumberType(datawave.data.type.NumberType) Value(org.apache.accumulo.core.data.Value) NormalizedFieldAndValue(datawave.ingest.data.config.NormalizedFieldAndValue) BulkIngestKey(datawave.ingest.mapreduce.job.BulkIngestKey) BeforeClass(org.junit.BeforeClass) SimpleDateFormat(java.text.SimpleDateFormat) Key(org.apache.accumulo.core.data.Key)
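
The writeKeyValues(connector, keyValues) call above is a test helper whose body is not shown here. A rough sketch of what such a helper might look like against an Accumulo Connector, assuming tables are created on demand and one mutation is written per key/value pair; the real helper in the project may differ:

import java.util.Map;

import com.google.common.collect.Multimap;

import datawave.ingest.mapreduce.job.BulkIngestKey;

import org.apache.accumulo.core.client.BatchWriterConfig;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.MultiTableBatchWriter;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.data.Value;

public class KeyValueWriter {

    // Hypothetical stand-in for the writeKeyValues helper: creates any missing tables
    // and writes one mutation per key/value pair, preserving visibility and timestamp.
    public static void writeKeyValues(Connector connector, Multimap<BulkIngestKey, Value> keyValues) throws Exception {
        MultiTableBatchWriter writer = connector.createMultiTableBatchWriter(new BatchWriterConfig());
        try {
            for (Map.Entry<BulkIngestKey, Value> entry : keyValues.entries()) {
                String table = entry.getKey().getTableName().toString();
                if (!connector.tableOperations().exists(table)) {
                    connector.tableOperations().create(table);
                }
                Key key = entry.getKey().getKey();
                Mutation mutation = new Mutation(key.getRow());
                mutation.put(key.getColumnFamily(), key.getColumnQualifier(), key.getColumnVisibilityParsed(), key.getTimestamp(), entry.getValue());
                writer.getBatchWriter(table).addMutation(mutation);
            }
        } finally {
            writer.close();
        }
    }
}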

Example 3 with CompositeIngest

Use of datawave.ingest.data.config.ingest.CompositeIngest in project datawave by NationalSecurityAgency.

From class EventMapper, method getFields:

public Multimap<String, NormalizedContentInterface> getFields(RawRecordContainer value, DataTypeHandler<K1> handler) throws Exception {
    Multimap<String, NormalizedContentInterface> newFields;
    // Parse the event into its field names and field values using the DataTypeHandler's BaseIngestHelper object.
    newFields = handler.getHelper(value.getDataType()).getEventFields(value);
    // Also get the virtual fields, if applicable.
    if (handler.getHelper(value.getDataType()) instanceof VirtualIngest) {
        VirtualIngest vHelper = (VirtualIngest) handler.getHelper(value.getDataType());
        Multimap<String, NormalizedContentInterface> virtualFields = vHelper.getVirtualFields(newFields);
        for (Entry<String, NormalizedContentInterface> v : virtualFields.entries()) newFields.put(v.getKey(), v.getValue());
    }
    // Also get the composite fields, if applicable
    if (handler.getHelper(value.getDataType()) instanceof CompositeIngest) {
        CompositeIngest vHelper = (CompositeIngest) handler.getHelper(value.getDataType());
        Multimap<String, NormalizedContentInterface> compositeFields = vHelper.getCompositeFields(newFields);
        for (String fieldName : compositeFields.keySet()) {
            // if this is an overloaded composite field, we are replacing the existing field data
            if (vHelper.isOverloadedCompositeField(fieldName))
                newFields.removeAll(fieldName);
            newFields.putAll(fieldName, compositeFields.get(fieldName));
        }
    }
    // Create a LOAD_DATE parameter, which is the current time in milliseconds, for all datatypes
    long loadDate = now.get();
    NormalizedFieldAndValue loadDateValue = new NormalizedFieldAndValue(LOAD_DATE_FIELDNAME, Long.toString(loadDate));
    // set an indexed field value for use by the date index data type handler
    loadDateValue.setIndexedFieldValue(dateNormalizer.normalizeDelegateType(new Date(loadDate)));
    newFields.put(LOAD_DATE_FIELDNAME, loadDateValue);
    String seqFileName = null;
    // place the sequence filename into the event
    if (createSequenceFileName) {
        seqFileName = NDC.peek();
        if (trimSequenceFileName) {
            seqFileName = StringUtils.substringAfterLast(seqFileName, "/");
        }
        if (null != seqFileName) {
            StringBuilder seqFile = new StringBuilder(seqFileName);
            seqFile.append(SRC_FILE_DEL).append(offset);
            if (null != splitStart) {
                seqFile.append(SRC_FILE_DEL).append(splitStart);
            }
            newFields.put(SEQUENCE_FILE_FIELDNAME, new NormalizedFieldAndValue(SEQUENCE_FILE_FIELDNAME, seqFile.toString()));
        }
    }
    if (createRawFileName && !value.getRawFileName().isEmpty() && !value.getRawFileName().equals(seqFileName)) {
        newFields.put(RAW_FILE_FIELDNAME, new NormalizedFieldAndValue(RAW_FILE_FIELDNAME, value.getRawFileName()));
    }
    // Also if this helper needs to filter the fields before returning, apply now
    if (handler.getHelper(value.getDataType()) instanceof FilterIngest) {
        FilterIngest fHelper = (FilterIngest) handler.getHelper(value.getDataType());
        fHelper.filter(newFields);
    }
    return newFields;
}
Also used: VirtualIngest(datawave.ingest.data.config.ingest.VirtualIngest) FilterIngest(datawave.ingest.data.config.ingest.FilterIngest) NormalizedContentInterface(datawave.ingest.data.config.NormalizedContentInterface) CompositeIngest(datawave.ingest.data.config.ingest.CompositeIngest) NormalizedFieldAndValue(datawave.ingest.data.config.NormalizedFieldAndValue) Date(java.util.Date)
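
The distinction getFields draws between overloaded and non-overloaded composite fields (replace the original values versus add alongside them) can be illustrated with a small self-contained sketch using plain Guava multimaps; the field name and values below are made up for illustration:

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;

public class OverloadedCompositeDemo {

    public static void main(String[] args) {
        // Event fields as they would arrive from getEventFields (values simplified to strings).
        Multimap<String, String> fields = HashMultimap.create();
        fields.put("GEO_FIELD", "POINT (10 20)");

        // Composite output keyed by the same name as an existing event field.
        Multimap<String, String> compositeFields = HashMultimap.create();
        compositeFields.put("GEO_FIELD", "POINT (10 20)|42");

        // Stand-in for isOverloadedCompositeField("GEO_FIELD") returning true.
        boolean overloaded = true;
        for (String fieldName : compositeFields.keySet()) {
            if (overloaded) {
                // overloaded: the composite values replace the original event values
                fields.removeAll(fieldName);
            }
            // non-overloaded composites would simply be added alongside the originals
            fields.putAll(fieldName, compositeFields.get(fieldName));
        }

        // Prints {GEO_FIELD=[POINT (10 20)|42]}
        System.out.println(fields);
    }
}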

Aggregations

NormalizedContentInterface (datawave.ingest.data.config.NormalizedContentInterface): 3
CompositeIngest (datawave.ingest.data.config.ingest.CompositeIngest): 3
RawRecordContainerImpl (datawave.ingest.config.RawRecordContainerImpl): 2
NormalizedFieldAndValue (datawave.ingest.data.config.NormalizedFieldAndValue): 2
VirtualIngest (datawave.ingest.data.config.ingest.VirtualIngest): 2
BulkIngestKey (datawave.ingest.mapreduce.job.BulkIngestKey): 2
Key (org.apache.accumulo.core.data.Key): 2
Value (org.apache.accumulo.core.data.Value): 2
Text (org.apache.hadoop.io.Text): 2
HashMultimap (com.google.common.collect.HashMultimap): 1
Multimap (com.google.common.collect.Multimap): 1
InMemoryInstance (datawave.accumulo.inmemory.InMemoryInstance): 1
GeometryType (datawave.data.type.GeometryType): 1
NumberType (datawave.data.type.NumberType): 1
RawRecordContainer (datawave.ingest.data.RawRecordContainer): 1
Type (datawave.ingest.data.Type): 1
FilterIngest (datawave.ingest.data.config.ingest.FilterIngest): 1
AbstractColumnBasedHandler (datawave.ingest.mapreduce.handler.shard.AbstractColumnBasedHandler): 1
StandaloneStatusReporter (datawave.ingest.test.StandaloneStatusReporter): 1
StandaloneTaskAttemptContext (datawave.ingest.test.StandaloneTaskAttemptContext): 1