Search in sources :

Example 21 with Schema

use of edu.uci.ics.texera.api.schema.Schema in project textdb by TextDB.

the class Join method open.

/**
 * Opens this join operator.
 *
 * Does nothing unless the cursor is currently CLOSED. Otherwise both input
 * operators are opened (inner first, then outer), the output schema is derived
 * from their output schemas through the join predicate, and the cursor is
 * switched to OPENED.
 *
 * @throws TexeraException a DataflowException is raised when the inner or the
 *         outer input operator has not been set
 */
@Override
public void open() throws TexeraException {
    final boolean alreadyOpened = cursor != CLOSED;
    if (alreadyOpened) {
        return;
    }

    // Both inputs are mandatory; the inner side is validated before the outer side.
    if (null == innerOperator) {
        throw new DataflowException("Inner Input Operator is not set.");
    }
    if (null == outerOperator) {
        throw new DataflowException("Outer Input Operator is not set.");
    }

    // Each input is opened right before its schema is read.
    innerOperator.open();
    final Schema schemaOfInner = innerOperator.getOutputSchema();

    outerOperator.open();
    final Schema schemaOfOuter = outerOperator.getOutputSchema();

    // The join predicate decides how the two input schemas combine.
    this.outputSchema = joinPredicate.generateOutputSchema(schemaOfInner, schemaOfOuter);
    cursor = OPENED;
}
Also used : Schema(edu.uci.ics.texera.api.schema.Schema) DataflowException(edu.uci.ics.texera.api.exception.DataflowException)

Example 22 with Schema

use of edu.uci.ics.texera.api.schema.Schema in project textdb by TextDB.

the class JoinDistancePredicate method generateIntersectionSchema.

/**
 * Builds the output schema as the intersection of the inner and the outer
 * operator's schemas.
 *
 * An inner attribute is kept only when the outer attribute list contains it
 * (List.contains, i.e. Attribute equality); kept attributes follow the order of
 * the inner schema.
 *
 * The resulting schema must:
 * be non-empty,
 * contain the join attribute,
 * contain the "_ID" attribute,
 * contain the "spanList" attribute,
 * and the join attribute must be of type TEXT or STRING.
 *
 * @param innerOperatorSchema output schema of the inner operator
 * @param outerOperatorSchema output schema of the outer operator
 * @return the intersection schema
 * @throws DataflowException if any of the requirements above is not met
 */
private Schema generateIntersectionSchema(Schema innerOperatorSchema, Schema outerOperatorSchema) throws DataflowException {
    final List<Attribute> attrsOfInner = innerOperatorSchema.getAttributes();
    final List<Attribute> attrsOfOuter = outerOperatorSchema.getAttributes();

    // Keep every inner attribute that also occurs on the outer side.
    final Attribute[] sharedAttrs = attrsOfInner.stream()
            .filter(candidate -> attrsOfOuter.contains(candidate))
            .toArray(Attribute[]::new);
    final Schema sharedSchema = new Schema(sharedAttrs);

    // Validate required attributes; each failed check aborts with its own message.
    if (sharedSchema.getAttributes().isEmpty()) {
        throw new DataflowException("inner operator and outer operator don't share any common attributes");
    }
    if (!sharedSchema.containsAttribute(this.joinAttributeName)) {
        throw new DataflowException("inner operator or outer operator doesn't contain join attribute");
    }
    if (!sharedSchema.containsAttribute(SchemaConstants._ID)) {
        throw new DataflowException("inner operator or outer operator doesn't contain _ID attribute");
    }
    if (!sharedSchema.containsAttribute(SchemaConstants.SPAN_LIST)) {
        throw new DataflowException("inner operator or outer operator doesn't contain spanList attribute");
    }

    // Only textual join attributes (TEXT or STRING) are accepted.
    final AttributeType typeOfJoinAttr = sharedSchema.getAttribute(this.joinAttributeName).getType();
    final boolean textual = typeOfJoinAttr == AttributeType.TEXT || typeOfJoinAttr == AttributeType.STRING;
    if (!textual) {
        throw new DataflowException(String.format("Join attribute %s must be either TEXT or STRING.", this.joinAttributeName));
    }
    return sharedSchema;
}
Also used : JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) ListField(edu.uci.ics.texera.api.field.ListField) edu.uci.ics.texera.api.tuple(edu.uci.ics.texera.api.tuple) Iterator(java.util.Iterator) PropertyNameConstants(edu.uci.ics.texera.dataflow.common.PropertyNameConstants) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) PredicateBase(edu.uci.ics.texera.dataflow.common.PredicateBase) Collectors(java.util.stream.Collectors) Span(edu.uci.ics.texera.api.span.Span) ArrayList(java.util.ArrayList) List(java.util.List) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IOperator(edu.uci.ics.texera.api.dataflow.IOperator) IField(edu.uci.ics.texera.api.field.IField) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) Attribute(edu.uci.ics.texera.api.schema.Attribute) Attribute(edu.uci.ics.texera.api.schema.Attribute) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) DataflowException(edu.uci.ics.texera.api.exception.DataflowException)

Example 23 with Schema

use of edu.uci.ics.texera.api.schema.Schema in project textdb by TextDB.

the class SimilarityJoinPredicate method joinTuples.

/**
 * Joins an inner and an outer tuple based on the similarity of their span values.
 *
 * Only spans whose attribute name equals the respective join attribute name are
 * considered. Every distinct inner span value is compared with every distinct
 * outer span value; when the similarity reaches the threshold, both values are
 * marked as matched. All considered spans carrying a matched value are passed
 * through addFieldPrefix (inner spans with INNER_PREFIX first, then outer spans
 * with OUTER_PREFIX) and handed to mergeTuples together with both tuples.
 *
 * @param innerTuple tuple from the inner input
 * @param outerTuple tuple from the outer input
 * @param outputSchema schema forwarded to mergeTuples
 * @return the result of mergeTuples, or null when the similarity threshold is 0
 *         or no pair of values reaches the threshold
 * @throws DataflowException declared by the interface of this method
 */
@Override
public Tuple joinTuples(Tuple innerTuple, Tuple outerTuple, Schema outputSchema) throws DataflowException {
    if (similarityThreshold == 0) {
        return null;
    }

    // Restrict each side's span list to the spans of its join attribute.
    ListField<Span> spanFieldOfInner = innerTuple.getField(SchemaConstants.SPAN_LIST);
    List<Span> candidateSpansOfInner = spanFieldOfInner.getValue().stream()
            .filter(candidate -> candidate.getAttributeName().equals(innerJoinAttrName))
            .collect(Collectors.toList());

    ListField<Span> spanFieldOfOuter = outerTuple.getField(SchemaConstants.SPAN_LIST);
    List<Span> candidateSpansOfOuter = spanFieldOfOuter.getValue().stream()
            .filter(candidate -> candidate.getAttributeName().equals(outerJoinAttrName))
            .collect(Collectors.toList());

    // Several spans may share one value, so compare distinct values only.
    Set<String> distinctValuesOfInner = candidateSpansOfInner.stream()
            .map(Span::getValue)
            .collect(Collectors.toSet());
    Set<String> distinctValuesOfOuter = candidateSpansOfOuter.stream()
            .map(Span::getValue)
            .collect(Collectors.toSet());

    // Collect both members of every pair that reaches the threshold.
    Set<String> matchedValues = new HashSet<>();
    for (String valueOfInner : distinctValuesOfInner) {
        for (String valueOfOuter : distinctValuesOfOuter) {
            if (this.similarityFunc.calculateSimilarity(valueOfInner, valueOfOuter) >= this.similarityThreshold) {
                matchedValues.add(valueOfInner);
                matchedValues.add(valueOfOuter);
            }
        }
    }

    // Nothing was similar enough: there is no joined tuple.
    if (matchedValues.isEmpty()) {
        return null;
    }

    // Emit the matching spans, inner side first, each tagged with its side's prefix.
    List<Span> prefixedSpans = new ArrayList<>();
    for (Span spanOfInner : candidateSpansOfInner) {
        if (matchedValues.contains(spanOfInner.getValue())) {
            prefixedSpans.add(addFieldPrefix(spanOfInner, INNER_PREFIX));
        }
    }
    for (Span spanOfOuter : candidateSpansOfOuter) {
        if (matchedValues.contains(spanOfOuter.getValue())) {
            prefixedSpans.add(addFieldPrefix(spanOfOuter, OUTER_PREFIX));
        }
    }
    return mergeTuples(innerTuple, outerTuple, outputSchema, prefixedSpans);
}
Also used : JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) ListField(edu.uci.ics.texera.api.field.ListField) java.util(java.util) edu.uci.ics.texera.api.tuple(edu.uci.ics.texera.api.tuple) PropertyNameConstants(edu.uci.ics.texera.dataflow.common.PropertyNameConstants) PredicateBase(edu.uci.ics.texera.dataflow.common.PredicateBase) Collectors(java.util.stream.Collectors) Span(edu.uci.ics.texera.api.span.Span) SchemaConstants(edu.uci.ics.texera.api.constants.SchemaConstants) IOperator(edu.uci.ics.texera.api.dataflow.IOperator) IField(edu.uci.ics.texera.api.field.IField) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) JsonIgnore(com.fasterxml.jackson.annotation.JsonIgnore) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) Attribute(edu.uci.ics.texera.api.schema.Attribute) IDField(edu.uci.ics.texera.api.field.IDField) NormalizedLevenshtein(info.debatty.java.stringsimilarity.NormalizedLevenshtein) Span(edu.uci.ics.texera.api.span.Span)

Example 24 with Schema

use of edu.uci.ics.texera.api.schema.Schema in project textdb by TextDB.

the class EmojiSentimentOperator method open.

/**
 * Opens this operator.
 *
 * Does nothing unless the cursor is currently CLOSED. Otherwise the input
 * operator is opened, the configured input attribute is validated against the
 * input schema (it must exist and be of type STRING or TEXT), the output schema
 * is produced by transformSchema, and the cursor is switched to OPENED.
 *
 * @throws TexeraException if the input attribute is missing from the input
 *         schema or has an unsupported type; a DataflowException is raised
 *         when no input operator has been set
 */
@Override
public void open() throws TexeraException {
    final boolean alreadyOpened = cursor != CLOSED;
    if (alreadyOpened) {
        return;
    }
    if (null == inputOperator) {
        throw new DataflowException(ErrorMessages.INPUT_OPERATOR_NOT_SPECIFIED);
    }

    inputOperator.open();
    final Schema upstreamSchema = inputOperator.getOutputSchema();

    // The configured input attribute has to exist in the upstream schema.
    final boolean attributeMissing = !upstreamSchema.containsAttribute(predicate.getInputAttributeName());
    if (attributeMissing) {
        final String missingMessage = String.format("input attribute %s is not in the input schema %s", predicate.getInputAttributeName(), upstreamSchema.getAttributeNames());
        throw new TexeraException(missingMessage);
    }

    // Only STRING and TEXT attributes are accepted as input.
    final AttributeType actualType = upstreamSchema.getAttribute(predicate.getInputAttributeName()).getType();
    if (!actualType.equals(AttributeType.STRING) && !actualType.equals(AttributeType.TEXT)) {
        final String typeMessage = String.format("input attribute %s must have type String or Text, its actual type is %s", predicate.getInputAttributeName(), actualType);
        throw new TexeraException(typeMessage);
    }

    // The output schema is the transformed schema of the input operator.
    outputSchema = transformSchema(inputOperator.getOutputSchema());
    cursor = OPENED;
}
Also used : AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) TexeraException(edu.uci.ics.texera.api.exception.TexeraException)

Example 25 with Schema

use of edu.uci.ics.texera.api.schema.Schema in project textdb by TextDB.

the class NltkSentimentOperator method open.

/**
 * Opens this operator.
 *
 * Returns immediately when the cursor is not CLOSED. Otherwise it opens the
 * input operator, verifies that the configured input attribute exists in the
 * input schema and is of type STRING or TEXT, derives the output schema with
 * transformSchema, and marks the cursor as OPENED.
 *
 * @throws TexeraException if the input attribute is absent from the input
 *         schema or is neither STRING nor TEXT; a DataflowException is raised
 *         when the input operator has not been set
 */
@Override
public void open() throws TexeraException {
    if (!(cursor == CLOSED)) {
        return;
    }
    if (null == inputOperator) {
        throw new DataflowException(ErrorMessages.INPUT_OPERATOR_NOT_SPECIFIED);
    }

    inputOperator.open();
    final Schema sourceSchema = inputOperator.getOutputSchema();

    // Reject the configuration when the input attribute is not part of the source schema.
    final boolean attributeKnown = sourceSchema.containsAttribute(predicate.getInputAttributeName());
    if (!attributeKnown) {
        final String unknownAttributeMessage = String.format("input attribute %s is not in the input schema %s", predicate.getInputAttributeName(), sourceSchema.getAttributeNames());
        throw new TexeraException(unknownAttributeMessage);
    }

    // The attribute must be textual: STRING or TEXT.
    final AttributeType attributeType = sourceSchema.getAttribute(predicate.getInputAttributeName()).getType();
    if (!attributeType.equals(AttributeType.STRING) && !attributeType.equals(AttributeType.TEXT)) {
        final String wrongTypeMessage = String.format("input attribute %s must have type String or Text, its actual type is %s", predicate.getInputAttributeName(), attributeType);
        throw new TexeraException(wrongTypeMessage);
    }

    // Derive the output schema from the input operator's output schema.
    outputSchema = transformSchema(inputOperator.getOutputSchema());
    cursor = OPENED;
}
Also used : AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) TexeraException(edu.uci.ics.texera.api.exception.TexeraException)

Aggregations

Schema (edu.uci.ics.texera.api.schema.Schema)134 Test (org.junit.Test)109 Tuple (edu.uci.ics.texera.api.tuple.Tuple)106 ArrayList (java.util.ArrayList)97 IField (edu.uci.ics.texera.api.field.IField)96 Span (edu.uci.ics.texera.api.span.Span)86 TextField (edu.uci.ics.texera.api.field.TextField)77 Attribute (edu.uci.ics.texera.api.schema.Attribute)76 StringField (edu.uci.ics.texera.api.field.StringField)72 IntegerField (edu.uci.ics.texera.api.field.IntegerField)71 DoubleField (edu.uci.ics.texera.api.field.DoubleField)60 DateField (edu.uci.ics.texera.api.field.DateField)57 SimpleDateFormat (java.text.SimpleDateFormat)54 Dictionary (edu.uci.ics.texera.dataflow.dictionarymatcher.Dictionary)29 ListField (edu.uci.ics.texera.api.field.ListField)21 IOperator (edu.uci.ics.texera.api.dataflow.IOperator)15 DataflowException (edu.uci.ics.texera.api.exception.DataflowException)14 AttributeType (edu.uci.ics.texera.api.schema.AttributeType)13 TexeraException (edu.uci.ics.texera.api.exception.TexeraException)9 JoinDistancePredicate (edu.uci.ics.texera.dataflow.join.JoinDistancePredicate)9