use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.
the class JoinDistancePredicate method generateIntersectionSchema.
/**
 * Builds the join's output schema as the intersection of the two input schemas.
 * An attribute of the inner schema is kept only when the outer schema's attribute
 * list contains an equal attribute (i.e. the name and the type must both match).
 *
 * The intersection is only accepted when it:
 *   - is not empty,
 *   - contains the join attribute,
 *   - contains the "_ID" attribute,
 *   - contains the "spanList" attribute,
 * and when the join attribute is of type TEXT or STRING.
 *
 * @param innerOperatorSchema schema produced by the inner operator
 * @param outerOperatorSchema schema produced by the outer operator
 * @return the schema made of the attributes shared by both inputs
 * @throws DataflowException if any of the requirements listed above is violated
 */
private Schema generateIntersectionSchema(Schema innerOperatorSchema, Schema outerOperatorSchema) throws DataflowException {
    List<Attribute> innerAttrs = innerOperatorSchema.getAttributes();
    List<Attribute> outerAttrs = outerOperatorSchema.getAttributes();

    // keep the inner attributes (in inner order) that also appear in the outer schema
    List<Attribute> sharedAttrs = innerAttrs.stream()
            .filter(outerAttrs::contains)
            .collect(Collectors.toList());
    Schema intersectionSchema = new Schema(sharedAttrs.toArray(new Attribute[0]));

    // the checks below run in a fixed order; the first failing one decides the error message
    if (intersectionSchema.getAttributes().isEmpty()) {
        throw new DataflowException("inner operator and outer operator don't share any common attributes");
    }
    if (!intersectionSchema.containsAttribute(this.joinAttributeName)) {
        throw new DataflowException("inner operator or outer operator doesn't contain join attribute");
    }
    if (!intersectionSchema.containsAttribute(SchemaConstants._ID)) {
        throw new DataflowException("inner operator or outer operator doesn't contain _ID attribute");
    }
    if (!intersectionSchema.containsAttribute(SchemaConstants.SPAN_LIST)) {
        throw new DataflowException("inner operator or outer operator doesn't contain spanList attribute");
    }

    // the join attribute has to be textual
    AttributeType joinType = intersectionSchema.getAttribute(this.joinAttributeName).getType();
    boolean isTextual = joinType == AttributeType.TEXT || joinType == AttributeType.STRING;
    if (!isTextual) {
        throw new DataflowException(String.format("Join attribute %s must be either TEXT or STRING.", this.joinAttributeName));
    }
    return intersectionSchema;
}
use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.
the class DictionaryMatcher method processOneInputTuple.
/**
 * Matches a single input tuple against the predicate's dictionary.
 *
 * When {@code addPayload} is set, a payload generated from the tuple is attached
 * first. The matching strategy is then chosen by the predicate's keyword matching type:
 * conjunction and phrase matching are delegated to the corresponding helper methods,
 * substring matching scans each attribute with {@code dictionaryTrie}, and regex
 * matching runs every dictionary pattern against every attribute.
 *
 * @param inputTuple the tuple to process; may be null
 * @return null when the input is null or nothing matched; otherwise a tuple built from
 *         the input, with the list of matching spans added under the predicate's
 *         span list name when {@code addResultAttribute} is set
 * @throws TexeraException (as DataflowException) if a matched attribute is neither STRING
 *         nor TEXT in the scan/regex strategies, or if the keyword matching type is not
 *         one of the supported strategies
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws TexeraException {
    if (inputTuple == null) {
        return null;
    }
    // add payload if needed before passing it to the matching functions
    if (addPayload) {
        Tuple.Builder tupleBuilderPayload = new Tuple.Builder(inputTuple);
        tupleBuilderPayload.add(SchemaConstants.PAYLOAD_ATTRIBUTE, new ListField<Span>(DataflowUtils.generatePayloadFromTuple(inputTuple, predicate.getAnalyzerString())));
        inputTuple = tupleBuilderPayload.build();
    }

    KeywordMatchingType matchingType = predicate.getKeywordMatchingType();
    List<Span> matchingResults;
    if (matchingType == KeywordMatchingType.CONJUNCTION_INDEXBASED) {
        ArrayList<String> dictionaryEntries = predicate.getDictionary().getDictionaryEntries();
        ArrayList<Set<String>> tokenSetsNoStopwords = predicate.getDictionary().getTokenSetsNoStopwords();
        matchingResults = appendConjunctionMatchingSpans4Dictionary(inputTuple, predicate.getAttributeNames(), tokenSetsNoStopwords, dictionaryEntries);
    } else if (matchingType == KeywordMatchingType.PHRASE_INDEXBASED) {
        ArrayList<String> dictionaryEntries = predicate.getDictionary().getDictionaryEntries();
        ArrayList<List<String>> tokenListsNoStopwords = predicate.getDictionary().getTokenListsNoStopwords();
        ArrayList<List<String>> tokenListsWithStopwords = predicate.getDictionary().getTokenListsWithStopwords();
        ArrayList<Set<String>> tokenSetsNoStopwords = predicate.getDictionary().getTokenSetsNoStopwords();
        matchingResults = appendPhraseMatchingSpans4Dictionary(inputTuple, predicate.getAttributeNames(), tokenListsNoStopwords, tokenSetsNoStopwords, tokenListsWithStopwords, dictionaryEntries);
    } else if (matchingType == KeywordMatchingType.SUBSTRING_SCANBASED) {
        matchingResults = new ArrayList<Span>();
        for (String attributeName : predicate.getAttributeNames()) {
            AttributeType attributeType = inputTuple.getSchema().getAttribute(attributeName).getType();
            // types other than TEXT and STRING: throw Exception for now.
            // Checked before the value is read so that an unsupported attribute always
            // reports this error instead of failing while stringifying its value.
            if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
                throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
            }
            String fieldValue = inputTuple.getField(attributeName).getValue().toString();
            for (ACTrie.Emit emit : dictionaryTrie.parseText(fieldValue)) {
                matchingResults.add(new Span(attributeName, emit.getStart(), emit.getEnd(), emit.getKeyword(), fieldValue.substring(emit.getStart(), emit.getEnd())));
            }
        }
    } else if (matchingType == KeywordMatchingType.REGEX) {
        ArrayList<Pattern> patternList = predicate.getDictionary().getPatternList();
        ArrayList<String> dictionaryEntries = predicate.getDictionary().getDictionaryEntries();
        matchingResults = new ArrayList<>();
        // patternList.get(i) is used as the pattern for dictionaryEntries.get(i)
        for (int i = 0; i < dictionaryEntries.size(); i++) {
            for (String attributeName : predicate.getAttributeNames()) {
                AttributeType attributeType = inputTuple.getSchema().getAttribute(attributeName).getType();
                // types other than TEXT and STRING: throw Exception for now (checked before reading the value)
                if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
                    throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
                }
                String fieldValue = inputTuple.getField(attributeName).getValue().toString();
                Matcher javaMatcher = patternList.get(i).matcher(fieldValue);
                while (javaMatcher.find()) {
                    int start = javaMatcher.start();
                    int end = javaMatcher.end();
                    matchingResults.add(new Span(attributeName, start, end, dictionaryEntries.get(i), fieldValue.substring(start, end)));
                }
            }
        }
    } else {
        // Previously an unhandled (or null) matching type left the result list null and
        // surfaced as a NullPointerException below; fail with an explicit message instead.
        throw new DataflowException("DictionaryMatcher: unsupported keyword matching type " + matchingType);
    }

    if (matchingResults.isEmpty()) {
        return null;
    }
    Tuple.Builder tupleBuilder = new Tuple.Builder(inputTuple);
    if (addResultAttribute) {
        tupleBuilder.add(predicate.getSpanListName(), AttributeType.LIST, new ListField<Span>(matchingResults));
    }
    return tupleBuilder.build();
}
use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.
the class DictionaryMatcher method appendPhraseMatchingSpans4Dictionary.
/**
 * Collects phrase-matching spans for every requested attribute of the tuple.
 *
 * The tuple's payload spans are first grouped per query index by
 * {@code filterRelevantSpans}. Then, per attribute:
 *   - STRING: the whole field value must be one of the entries in {@code queryList};
 *     a single span covering the complete value is produced.
 *   - TEXT: for each query index, the payload spans of this attribute are used to
 *     rebuild phrase spans, provided all query tokens are present among them.
 *
 * @param inputTuple tuple to match; must carry the payload field
 * @param attributeNames names of the attributes to inspect
 * @param queryTokenList per-query token lists, indexed like {@code queryList}
 * @param queryTokenSetList per-query token sets, indexed like {@code queryList}
 * @param queryTokenListWithStopwords per-query token lists including stopwords
 * @param queryList the dictionary entries
 * @return all spans found (possibly empty)
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
public List<Span> appendPhraseMatchingSpans4Dictionary(Tuple inputTuple, List<String> attributeNames, List<List<String>> queryTokenList, List<Set<String>> queryTokenSetList, List<List<String>> queryTokenListWithStopwords, List<String> queryList) throws DataflowException {
    List<Span> foundSpans = new ArrayList<>();
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    Map<Integer, List<Span>> spansByQueryIndex = filterRelevantSpans(payloadField.getValue(), queryTokenSetList);

    for (String attributeName : attributeNames) {
        AttributeType type = inputTuple.getSchema().getAttribute(attributeName).getType();
        String text = inputTuple.getField(attributeName).getValue().toString();

        // types other than TEXT and STRING: throw Exception for now
        boolean supported = type == AttributeType.STRING || type == AttributeType.TEXT;
        if (!supported) {
            throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }

        if (type == AttributeType.STRING) {
            // a STRING field only matches when its complete value is a dictionary entry
            if (queryList.contains(text)) {
                foundSpans.add(new Span(attributeName, 0, text.length(), text, text));
            }
        } else {
            // TEXT field: rebuild the spans of each phrase query from the payload spans
            for (Map.Entry<Integer, List<Span>> entry : spansByQueryIndex.entrySet()) {
                int queryIndex = entry.getKey();
                List<Span> spansInAttribute = entry.getValue().stream()
                        .filter(span -> span.getAttributeName().equals(attributeName))
                        .collect(Collectors.toList());
                boolean allTokensPresent = !spansInAttribute.isEmpty()
                        && DataflowUtils.isAllQueryTokensPresent(spansInAttribute, queryTokenSetList.get(queryIndex));
                if (allTokensPresent) {
                    foundSpans.addAll(DataflowUtils.constructPhraseMatchingSpans(attributeName, text, queryList.get(queryIndex), spansInAttribute, queryTokenListWithStopwords.get(queryIndex), queryTokenList.get(queryIndex)));
                }
            }
        }
    }
    return foundSpans;
}
use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.
the class NltkSentimentOperator method transformToOutputSchema.
/**
 * Validates the single input schema and derives the operator's output schema from it.
 *
 * @param inputSchema the input schemas; exactly one is expected
 * @return the result of {@code transformSchema} applied to the input schema
 * @throws TexeraException if the number of schemas is not one, if the predicate's input
 *         attribute is missing from the schema, or if that attribute is neither STRING nor TEXT
 */
public Schema transformToOutputSchema(Schema... inputSchema) {
    if (inputSchema.length != 1) {
        throw new TexeraException(String.format(ErrorMessages.NUMBER_OF_ARGUMENTS_DOES_NOT_MATCH, 1, inputSchema.length));
    }
    Schema soleInput = inputSchema[0];

    // the attribute to analyze has to exist in the input schema
    if (!soleInput.containsAttribute(predicate.getInputAttributeName())) {
        throw new TexeraException(String.format("input attribute %s is not in the input schema %s", predicate.getInputAttributeName(), soleInput.getAttributeNames()));
    }

    // ... and it has to be textual
    AttributeType inputAttributeType = soleInput.getAttribute(predicate.getInputAttributeName()).getType();
    if (!inputAttributeType.equals(AttributeType.STRING) && !inputAttributeType.equals(AttributeType.TEXT)) {
        throw new TexeraException(String.format("input attribute %s must have type String or Text, its actual type is %s", predicate.getInputAttributeName(), inputAttributeType));
    }
    return transformSchema(soleInput);
}
use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.
the class NltkSentimentOperator method convertToTexeraSchema.
/**
 * Translates an Arrow schema into a Texera schema, field by field, keeping the field names.
 *
 * Type mapping: Int -> INTEGER, FloatingPoint -> DOUBLE, Bool -> BOOLEAN,
 * Utf8 and Null -> TEXT, Date -> DATE, Struct -> DATETIME, List -> LIST.
 *
 * @param arrowSchema the Arrow schema to convert
 * @return the Texera schema with one attribute per Arrow field, in the same order
 * @throws DataflowException if a field has an Arrow type outside the mapping above
 */
private Schema convertToTexeraSchema(org.apache.arrow.vector.types.pojo.Schema arrowSchema) {
    List<Attribute> converted = new ArrayList<>();
    for (Field arrowField : arrowSchema.getFields()) {
        ArrowType fieldArrowType = arrowField.getFieldType().getType();
        AttributeType texeraType;
        switch (fieldArrowType.getTypeID()) {
            case Int:
                texeraType = INTEGER;
                break;
            case FloatingPoint:
                texeraType = DOUBLE;
                break;
            case Bool:
                texeraType = BOOLEAN;
                break;
            case Utf8:
            case Null:
                // Null-typed fields are treated as text, like Utf8
                texeraType = TEXT;
                break;
            case Date:
                texeraType = DATE;
                break;
            case Struct:
                // For now only Struct of DateTime
                texeraType = DATETIME;
                break;
            case List:
                texeraType = LIST;
                break;
            default:
                throw new DataflowException("Unsupported data type " + fieldArrowType.getTypeID() + " when converting back to Texera table.");
        }
        converted.add(new Attribute(arrowField.getName(), texeraType));
    }
    return new Schema(converted);
}
Aggregations