Search in sources :

Example 16 with AttributeType

use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.

the class DataWriter method getLuceneDocument.

/**
 * Converts a TextDB tuple to a Lucene document.
 *
 * The field at each position of the tuple is paired with the attribute at
 * the same position of the tuple's schema. For every pair, the attribute's
 * type and name together with the field's value are handed to
 * {@code StorageUtils.getLuceneField}, and whatever it returns is added to
 * the document.
 *
 * @param tuple the tuple to convert
 * @return a newly created document with one entry added per tuple field
 */
private static Document getLuceneDocument(Tuple tuple) {
    List<IField> tupleFields = tuple.getFields();
    List<Attribute> schemaAttributes = tuple.getSchema().getAttributes();
    Document luceneDocument = new Document();

    int position = 0;
    while (position < tupleFields.size()) {
        IField currentField = tupleFields.get(position);
        Attribute currentAttribute = schemaAttributes.get(position);
        AttributeType currentType = currentAttribute.getAttributeType();
        luceneDocument.add(StorageUtils.getLuceneField(
                currentType, currentAttribute.getAttributeName(), currentField.getValue()));
        position++;
    }

    return luceneDocument;
}
Also used : Attribute(edu.uci.ics.textdb.api.schema.Attribute) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) IField(edu.uci.ics.textdb.api.field.IField) Document(org.apache.lucene.document.Document)

Example 17 with AttributeType

use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.

the class DictionaryMatcherSourceOperator method computeMatchingResult.

/**
 * Matches a dictionary key against the predicate's attributes of a tuple.
 *
 * For attributes whose type is not TEXT, the key must equal the whole field
 * value (case-sensitive). For TEXT attributes, every case-insensitive
 * occurrence of the key as a literal substring of the field value produces
 * a span.
 *
 * The dictionary cursor is advanced on every call, whether or not anything
 * matched.
 *
 * @param key         the dictionary entry to look for; treated as literal
 *                    text, never as a regular expression
 * @param sourceTuple the tuple to search; its span list field is modified
 *                    in place when there is a match
 * @return {@code null} if the key matched nothing; otherwise the same
 *         {@code sourceTuple} instance with the new spans appended to its
 *         span list field
 * @throws TextDBException declared by the signature; propagated from the
 *                         operations this method calls
 */
private Tuple computeMatchingResult(String key, Tuple sourceTuple) throws TextDBException {
    List<String> attributeNames = predicate.getAttributeNames();
    List<Span> matchingResults = new ArrayList<>();
    // Compiled on first use and then shared by all TEXT attributes.
    Pattern keyPattern = null;

    for (String attributeName : attributeNames) {
        String fieldValue = sourceTuple.getField(attributeName).getValue().toString();
        AttributeType attributeType = inputSchema.getAttribute(attributeName).getAttributeType();

        if (attributeType != AttributeType.TEXT) {
            // if attribute type is not TEXT, then key needs to match the
            // fieldValue exactly
            if (fieldValue.equals(key)) {
                matchingResults.add(new Span(attributeName, 0, fieldValue.length(), key, fieldValue));
            }
        } else {
            // if attribute type is TEXT, then key can match a substring of
            // fieldValue
            if (keyPattern == null) {
                // Quote the key so regex metacharacters in a dictionary
                // entry (e.g. "c++", "a.b") are matched literally instead
                // of being interpreted or causing a PatternSyntaxException.
                keyPattern = Pattern.compile(Pattern.quote(key),
                        Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
            }
            // Match against the original value rather than a lowercased
            // copy: lowercasing can change the string length, which would
            // make the offsets below point at the wrong characters.
            Matcher matcher = keyPattern.matcher(fieldValue);
            while (matcher.find()) {
                int start = matcher.start();
                int end = matcher.end();
                matchingResults.add(new Span(attributeName, start, end, key, fieldValue.substring(start, end)));
            }
        }
    }

    advanceDictionaryCursor();

    if (matchingResults.isEmpty()) {
        return null;
    }

    // Append the new spans to the span list already stored on the tuple.
    ListField<Span> spanListField = sourceTuple.getField(predicate.getSpanListName());
    List<Span> spanList = spanListField.getValue();
    spanList.addAll(matchingResults);
    return sourceTuple;
}
Also used : Pattern(java.util.regex.Pattern) Matcher(java.util.regex.Matcher) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) ArrayList(java.util.ArrayList) Span(edu.uci.ics.textdb.api.span.Span)

Aggregations

AttributeType (edu.uci.ics.textdb.api.schema.AttributeType)17 ArrayList (java.util.ArrayList)11 DataFlowException (edu.uci.ics.textdb.api.exception.DataFlowException)10 Attribute (edu.uci.ics.textdb.api.schema.Attribute)8 Schema (edu.uci.ics.textdb.api.schema.Schema)8 Span (edu.uci.ics.textdb.api.span.Span)8 IField (edu.uci.ics.textdb.api.field.IField)5 SchemaConstants (edu.uci.ics.textdb.api.constants.SchemaConstants)4 ListField (edu.uci.ics.textdb.api.field.ListField)4 Tuple (edu.uci.ics.textdb.api.tuple.Tuple)4 Iterator (java.util.Iterator)4 List (java.util.List)4 Matcher (java.util.regex.Matcher)4 Pattern (java.util.regex.Pattern)4 Collectors (java.util.stream.Collectors)4 ErrorMessages (edu.uci.ics.textdb.api.constants.ErrorMessages)3 TextDBException (edu.uci.ics.textdb.api.exception.TextDBException)3 Utils (edu.uci.ics.textdb.api.utils.Utils)3 AbstractSingleInputOperator (edu.uci.ics.textdb.exp.common.AbstractSingleInputOperator)3 DataflowUtils (edu.uci.ics.textdb.exp.utils.DataflowUtils)3