Search in sources :

Example 11 with AttributeType

use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.

The method generateIntersectionSchema of the class JoinDistancePredicate.

/**
 * Builds the join output schema as the intersection of the inner and outer operator schemas.
 *
 * <p>An attribute of the inner schema is kept only when the outer schema's attribute list
 * contains an equal attribute (per the original documentation, equality means the same
 * name and the same type).
 *
 * <p>The resulting schema must contain:
 * <ul>
 *   <li>the join attribute ({@code this.joinAttributeName}), typed TEXT or STRING,</li>
 *   <li>the {@code SchemaConstants._ID} attribute,</li>
 *   <li>the {@code SchemaConstants.SPAN_LIST} attribute.</li>
 * </ul>
 *
 * @param innerOperatorSchema schema of the inner operator
 * @param outerOperatorSchema schema of the outer operator
 * @return the schema made of the attributes shared by both inputs, in inner-schema order
 * @throws DataFlowException if the schemas share no attribute, if a required attribute is
 *         missing from the intersection, or if the join attribute is neither TEXT nor STRING
 */
private Schema generateIntersectionSchema(Schema innerOperatorSchema, Schema outerOperatorSchema) throws DataFlowException {
    List<Attribute> innerAttributes = innerOperatorSchema.getAttributes();
    List<Attribute> outerAttributes = outerOperatorSchema.getAttributes();

    // Keep the inner attributes that also appear in the outer schema.
    List<Attribute> sharedAttributes = new ArrayList<>();
    for (Attribute candidate : innerAttributes) {
        if (outerAttributes.contains(candidate)) {
            sharedAttributes.add(candidate);
        }
    }
    Schema sharedSchema = new Schema(sharedAttributes.toArray(new Attribute[0]));

    // Validate, in order: non-empty, join attribute, _ID, spanList.
    if (sharedSchema.getAttributes().isEmpty()) {
        throw new DataFlowException("inner operator and outer operator don't share any common attributes");
    }
    Attribute joinAttribute = sharedSchema.getAttribute(this.joinAttributeName);
    if (joinAttribute == null) {
        throw new DataFlowException("inner operator or outer operator doesn't contain join attribute");
    }
    if (sharedSchema.getAttribute(SchemaConstants._ID) == null) {
        throw new DataFlowException("inner operator or outer operator doesn't contain _ID attribute");
    }
    if (sharedSchema.getAttribute(SchemaConstants.SPAN_LIST) == null) {
        throw new DataFlowException("inner operator or outer operator doesn't contain spanList attribute");
    }

    // The join is only defined on textual attributes.
    AttributeType joinType = joinAttribute.getAttributeType();
    boolean textual = joinType == AttributeType.TEXT || joinType == AttributeType.STRING;
    if (!textual) {
        throw new DataFlowException(String.format("Join attribute %s must be either TEXT or STRING.", this.joinAttributeName));
    }
    return sharedSchema;
}
Also used : SchemaConstants(edu.uci.ics.textdb.api.constants.SchemaConstants) JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) Attribute(edu.uci.ics.textdb.api.schema.Attribute) Iterator(java.util.Iterator) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) PredicateBase(edu.uci.ics.textdb.exp.common.PredicateBase) Schema(edu.uci.ics.textdb.api.schema.Schema) List(java.util.List) ListField(edu.uci.ics.textdb.api.field.ListField) IField(edu.uci.ics.textdb.api.field.IField) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) edu.uci.ics.textdb.api.tuple(edu.uci.ics.textdb.api.tuple) Span(edu.uci.ics.textdb.api.span.Span) PropertyNameConstants(edu.uci.ics.textdb.exp.common.PropertyNameConstants) IOperator(edu.uci.ics.textdb.api.dataflow.IOperator) Attribute(edu.uci.ics.textdb.api.schema.Attribute) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) Schema(edu.uci.ics.textdb.api.schema.Schema) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException)

Example 12 with AttributeType

use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.

The method computeSubstringMatchingResult of the class KeywordMatcher.

/**
 * Collects the spans where the predicate's query keyword occurs in the given tuple.
 *
 * <p>For every attribute named by the predicate:
 * <ul>
 *   <li>STRING attributes match only when the whole field value equals the query
 *       (case-sensitive); the span then covers {@code [0, query.length())}.</li>
 *   <li>TEXT attributes are searched for every case-insensitive occurrence of the query
 *       as a plain substring; one span is emitted per occurrence.</li>
 * </ul>
 *
 * <p>The query is always treated as literal text, never as a regular expression, so
 * keywords containing characters such as {@code + . ( [ \} match themselves.
 *
 * @param inputTuple the tuple whose fields are searched
 * @return the matching spans, possibly empty
 * @throws DataFlowException if one of the predicate's attributes is neither STRING nor TEXT
 */
private List<Span> computeSubstringMatchingResult(Tuple inputTuple) throws DataFlowException {
    List<Span> matchingResults = new ArrayList<>();
    // Compiled lazily on the first TEXT attribute and reused for the remaining ones.
    Pattern literalQueryPattern = null;
    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getAttributeType();
        // types other than TEXT and STRING: throw Exception for now
        // (checked before touching the field so the caller gets this error, not a failure
        // from reading an unsupported field)
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataFlowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();
        // for STRING type, the query should match the fieldValue completely
        if (attributeType == AttributeType.STRING) {
            if (fieldValue.equals(predicate.getQuery())) {
                matchingResults.add(new Span(attributeName, 0, predicate.getQuery().length(), predicate.getQuery(), fieldValue));
            }
        }
        if (attributeType == AttributeType.TEXT) {
            if (literalQueryPattern == null) {
                // Pattern.LITERAL: the keyword is user text, not a regex; without it,
                // metacharacters would be interpreted (or fail to compile).
                literalQueryPattern = Pattern.compile(predicate.getQuery().toLowerCase(),
                        Pattern.LITERAL | Pattern.CASE_INSENSITIVE);
            }
            Matcher matcher = literalQueryPattern.matcher(fieldValue.toLowerCase());
            while (matcher.find()) {
                int start = matcher.start();
                int end = matcher.end();
                // Offsets come from the lower-cased text; the span value is cut from the original.
                matchingResults.add(new Span(attributeName, start, end, predicate.getQuery(), fieldValue.substring(start, end)));
            }
        }
    }
    return matchingResults;
}
Also used : Pattern(java.util.regex.Pattern) Matcher(java.util.regex.Matcher) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) ArrayList(java.util.ArrayList) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) Span(edu.uci.ics.textdb.api.span.Span)

Example 13 with AttributeType

use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.

The method buildPhraseQuery of the class KeywordMatcherSourceOperator.

/**
 * Builds the Lucene query used for phrase matching over the predicate's attributes.
 *
 * <p>One optional (SHOULD) clause is added per attribute:
 * <ul>
 *   <li>STRING attribute: a term query on the query text exactly as given.</li>
 *   <li>TEXT attribute, single query token: a term query on the lower-cased query text.</li>
 *   <li>TEXT attribute, several tokens: a phrase query over the lower-cased tokens, where
 *       tokens found in {@code StandardAnalyzer.STOP_WORDS_SET} are left out but every
 *       remaining token keeps its original position index.</li>
 * </ul>
 *
 * @return the combined boolean query
 * @throws DataFlowException if an attribute is neither STRING nor TEXT
 */
private Query buildPhraseQuery() throws DataFlowException {
    BooleanQuery.Builder combinedQuery = new BooleanQuery.Builder();
    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getAttributeType();
        if (attributeType == AttributeType.STRING) {
            // STRING fields are matched on the untouched query text.
            combinedQuery.add(new TermQuery(new Term(attributeName, predicate.getQuery())), BooleanClause.Occur.SHOULD);
        } else if (attributeType == AttributeType.TEXT) {
            if (queryTokenList.size() == 1) {
                Term singleTerm = new Term(attributeName, predicate.getQuery().toLowerCase());
                combinedQuery.add(new TermQuery(singleTerm), BooleanClause.Occur.SHOULD);
            } else {
                PhraseQuery.Builder phrase = new PhraseQuery.Builder();
                for (int position = 0; position < queryTokensWithStopwords.size(); position++) {
                    String token = queryTokensWithStopwords.get(position);
                    // Stop words are skipped, but positions of the other tokens are preserved.
                    if (StandardAnalyzer.STOP_WORDS_SET.contains(token)) {
                        continue;
                    }
                    phrase.add(new Term(attributeName, token.toLowerCase()), position);
                }
                combinedQuery.add(phrase.build(), BooleanClause.Occur.SHOULD);
            }
        } else {
            // types other than TEXT and STRING: throw Exception for now
            throw new DataFlowException("KeywordPredicate: Fields other than STRING and TEXT are not supported yet");
        }
    }
    return combinedQuery.build();
}
Also used : BooleanQuery(org.apache.lucene.search.BooleanQuery) TermQuery(org.apache.lucene.search.TermQuery) Query(org.apache.lucene.search.Query) PhraseQuery(org.apache.lucene.search.PhraseQuery) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) TermQuery(org.apache.lucene.search.TermQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) PhraseQuery(org.apache.lucene.search.PhraseQuery) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) Term(org.apache.lucene.index.Term)

Example 14 with AttributeType

use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.

The method populateOutputBuffer of the class RegexSplitOperator.

/**
 * Splits one attribute of the input tuple and refills {@code outputTupleBuffer} with one
 * output tuple per piece returned by {@code splitText}.
 *
 * <p>Per the original note: if the regex does not have any match in the tuple, the whole
 * string is returned as the result (that behavior lives in {@code splitText}).
 *
 * <p>Nothing happens, and the buffer is left untouched, when the tuple is {@code null} or
 * when the attribute to split is neither TEXT nor STRING. Each output tuple starts with a
 * freshly generated ID, drops the input's {@code _ID} field, replaces the split attribute
 * with the piece (as a TextField or StringField matching the attribute type), and carries
 * all other fields over unchanged.
 *
 * @param inputTuple the tuple to split; may be {@code null}
 * @throws TextDBException declared by the original signature
 */
private void populateOutputBuffer(Tuple inputTuple) throws TextDBException {
    if (inputTuple == null) {
        return;
    }
    String splitAttributeName = predicate.getAttributeToSplit();
    AttributeType attributeType = this.inputSchema.getAttribute(splitAttributeName).getAttributeType();
    boolean isText = attributeType == AttributeType.TEXT;
    if (!isText && attributeType != AttributeType.STRING) {
        return;
    }

    String original = inputTuple.getField(splitAttributeName).getValue().toString();
    List<String> pieces = splitText(original);

    outputTupleBuffer = new ArrayList<>();
    for (String piece : pieces) {
        List<IField> fields = new ArrayList<>();
        // Every produced tuple gets its own new UUID in front.
        fields.add(IDField.newRandomID());
        for (String attributeName : inputSchema.getAttributeNames()) {
            if (attributeName.equals(SchemaConstants._ID)) {
                // The old ID is dropped; the new one was added above.
            } else if (attributeName.equals(splitAttributeName)) {
                IField pieceField = isText ? new TextField(piece) : new StringField(piece);
                fields.add(pieceField);
            } else {
                fields.add(inputTuple.getField(attributeName));
            }
        }
        outputTupleBuffer.add(new Tuple(outputSchema, fields.toArray(new IField[0])));
    }
}
Also used : AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) StringField(edu.uci.ics.textdb.api.field.StringField) ArrayList(java.util.ArrayList) TextField(edu.uci.ics.textdb.api.field.TextField) IField(edu.uci.ics.textdb.api.field.IField) Tuple(edu.uci.ics.textdb.api.tuple.Tuple)

Example 15 with AttributeType

use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.

The method documentToFields of the class DataReader.

/**
 * Converts a Lucene document into the list of fields described by {@code inputSchema}.
 *
 * <p>For each schema attribute, in schema order, the stored value is looked up in the
 * document by attribute name and turned into a field of the attribute's type through
 * {@code StorageUtils.getField}.
 *
 * @param luceneDocument the document to read the stored values from
 * @return one field per schema attribute, in schema order
 * @throws ParseException declared by the original signature; presumably raised by
 *         {@code StorageUtils.getField} when a stored value cannot be parsed
 */
private ArrayList<IField> documentToFields(Document luceneDocument) throws ParseException {
    ArrayList<IField> result = new ArrayList<>();
    for (Attribute attribute : inputSchema.getAttributes()) {
        result.add(StorageUtils.getField(
                attribute.getAttributeType(),
                luceneDocument.get(attribute.getAttributeName())));
    }
    return result;
}
Also used : Attribute(edu.uci.ics.textdb.api.schema.Attribute) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) ArrayList(java.util.ArrayList) IField(edu.uci.ics.textdb.api.field.IField)

Aggregations

AttributeType (edu.uci.ics.textdb.api.schema.AttributeType)17 ArrayList (java.util.ArrayList)11 DataFlowException (edu.uci.ics.textdb.api.exception.DataFlowException)10 Attribute (edu.uci.ics.textdb.api.schema.Attribute)8 Schema (edu.uci.ics.textdb.api.schema.Schema)8 Span (edu.uci.ics.textdb.api.span.Span)8 IField (edu.uci.ics.textdb.api.field.IField)5 SchemaConstants (edu.uci.ics.textdb.api.constants.SchemaConstants)4 ListField (edu.uci.ics.textdb.api.field.ListField)4 Tuple (edu.uci.ics.textdb.api.tuple.Tuple)4 Iterator (java.util.Iterator)4 List (java.util.List)4 Matcher (java.util.regex.Matcher)4 Pattern (java.util.regex.Pattern)4 Collectors (java.util.stream.Collectors)4 ErrorMessages (edu.uci.ics.textdb.api.constants.ErrorMessages)3 TextDBException (edu.uci.ics.textdb.api.exception.TextDBException)3 Utils (edu.uci.ics.textdb.api.utils.Utils)3 AbstractSingleInputOperator (edu.uci.ics.textdb.exp.common.AbstractSingleInputOperator)3 DataflowUtils (edu.uci.ics.textdb.exp.utils.DataflowUtils)3