use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.
the class NlpSplitOperator method open.
/**
 * Opens this operator. When the cursor is not CLOSED the call is a no-op.
 * Otherwise the input operator is opened, the configured input attribute is
 * validated against the input schema, the output schema is computed via
 * {@code transformSchema}, and the cursor is set to OPENED.
 *
 * @throws TextDBException declared by the signature; the checks in this method
 *         raise DataFlowException when no input operator is set, when the input
 *         attribute is absent from the input schema, or when its type is
 *         neither STRING nor TEXT
 */
@Override
public void open() throws TextDBException {
    // Only a closed operator can be opened.
    if (cursor != CLOSED) {
        return;
    }
    if (inputOperator == null) {
        throw new DataFlowException(ErrorMessages.INPUT_OPERATOR_NOT_SPECIFIED);
    }
    inputOperator.open();

    Schema upstreamSchema = inputOperator.getOutputSchema();
    String attributeName = predicate.getInputAttributeName();

    // The attribute to split must exist in the incoming schema.
    if (!upstreamSchema.containsField(attributeName)) {
        throw new DataFlowException(String.format(
                "input attribute %s is not in the input schema %s",
                attributeName, upstreamSchema.getAttributeNames()));
    }

    // Only STRING and TEXT attributes are accepted.
    AttributeType attributeType = upstreamSchema.getAttribute(attributeName).getAttributeType();
    if (!attributeType.equals(AttributeType.STRING) && !attributeType.equals(AttributeType.TEXT)) {
        throw new DataFlowException(String.format(
                "input attribute %s must have type String or Text, its actual type is %s",
                attributeName, attributeType));
    }

    // Derive the output schema from the input schema; the shape depends on the
    // chosen output format (OneToOne vs. OneToMany).
    outputSchema = transformSchema(inputOperator.getOutputSchema());
    cursor = OPENED;
}
use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.
the class RegexMatcher method processOneInputTuple.
/**
 * Runs the configured regex engine over every attribute named in the
 * predicate and appends the matching spans to the tuple's span list.
 * <p>
 * For example, given tuple ("george watson", "graduate student", 23,
 * "(949)888-8888") and regex "g[^\s]*", the spans appended are
 * [Span(name, 0, 6, "g[^\s]*", "george watson"), Span(position, 0, 8,
 * "g[^\s]*", "graduate student")]
 *
 * @param inputTuple
 *            document in which the search is performed; may be null
 * @return the same {@code inputTuple} with the matching spans added to its
 *         span list field, or null when {@code inputTuple} is null or no
 *         attribute produced a match
 * @throws DataFlowException
 *             if one of the predicate's attributes is neither STRING nor TEXT
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws DataFlowException {
    if (inputTuple == null) {
        return null;
    }
    List<Span> matchingResults = new ArrayList<>();
    for (String attributeName : predicate.getAttributeNames()) {
        AttributeType attributeType = inputSchema.getAttribute(attributeName).getAttributeType();
        // types other than TEXT and STRING: throw Exception for now.
        // Checked before the field value is read so unsupported fields are
        // rejected without being stringified first.
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataFlowException("RegexMatcher: Fields other than STRING and TEXT are not supported yet");
        }
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();
        // Dispatch to the selected regex engine; engines other than the two
        // listed here contribute no spans.
        switch (regexEngine) {
            case JavaRegex:
                matchingResults.addAll(javaRegexMatch(fieldValue, attributeName));
                break;
            case RE2J:
                matchingResults.addAll(re2jRegexMatch(fieldValue, attributeName));
                break;
        }
    }
    // A tuple without any match is filtered out.
    if (matchingResults.isEmpty()) {
        return null;
    }
    // The span list of the input tuple is extended in place.
    ListField<Span> spanListField = inputTuple.getField(predicate.getSpanListName());
    List<Span> spanList = spanListField.getValue();
    spanList.addAll(matchingResults);
    return inputTuple;
}
use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.
the class SimilarityJoinPredicate method generateOutputSchema.
/**
 * Builds the schema of the join output from the two input schemas.
 * <p>
 * Layout of the result: the shared _ID attribute, then every inner attribute
 * prefixed with INNER_PREFIX, then every outer attribute prefixed with
 * OUTER_PREFIX, then the span list attribute, and finally the payload
 * attribute when at least one input schema contains a payload field. The
 * inputs' own _id, spanList and payload attributes are not copied.
 *
 * @param innerOperatorSchema schema of the inner operator
 * @param outerOperatorSchema schema of the outer operator
 * @return the combined output schema
 * @throws DataFlowException declared in the signature; no statement in this
 *         method throws it explicitly
 */
@Override
public Schema generateOutputSchema(Schema innerOperatorSchema, Schema outerOperatorSchema) throws DataFlowException {
    List<Attribute> combined = new ArrayList<>();

    // _ID always comes first
    combined.add(SchemaConstants._ID_ATTRIBUTE);

    // inner attributes, minus the reserved ones
    for (Attribute innerAttr : innerOperatorSchema.getAttributes()) {
        String name = innerAttr.getAttributeName();
        boolean reserved = name.equals(SchemaConstants._ID)
                || name.equals(SchemaConstants.SPAN_LIST)
                || name.equals(SchemaConstants.PAYLOAD);
        if (!reserved) {
            combined.add(new Attribute(INNER_PREFIX + name, innerAttr.getAttributeType()));
        }
    }

    // outer attributes, minus the reserved ones
    for (Attribute outerAttr : outerOperatorSchema.getAttributes()) {
        String name = outerAttr.getAttributeName();
        boolean reserved = name.equals(SchemaConstants._ID)
                || name.equals(SchemaConstants.SPAN_LIST)
                || name.equals(SchemaConstants.PAYLOAD);
        if (!reserved) {
            combined.add(new Attribute(OUTER_PREFIX + name, outerAttr.getAttributeType()));
        }
    }

    // a single span list for the joined tuple
    combined.add(SchemaConstants.SPAN_LIST_ATTRIBUTE);

    // payload is carried over only when at least one side has it
    boolean anyPayload = innerOperatorSchema.containsField(SchemaConstants.PAYLOAD)
            || outerOperatorSchema.containsField(SchemaConstants.PAYLOAD);
    if (anyPayload) {
        combined.add(SchemaConstants.PAYLOAD_ATTRIBUTE);
    }

    return new Schema(combined.toArray(new Attribute[0]));
}
use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.
the class KeywordMatcher method computeConjunctionMatchingResult.
/**
 * Collects the spans of a conjunction keyword match for one tuple.
 * <p>
 * A STRING attribute matches only when its value equals the whole query; one
 * span covering the query length is produced for it. A TEXT attribute matches
 * when the relevant payload spans of that attribute satisfy
 * {@code isAllQueryTokensPresent}; all of those spans are then reported.
 *
 * @param inputTuple tuple carrying the payload field and the attributes to search
 * @return the matching spans, empty when nothing matched
 * @throws DataFlowException if an attribute is neither STRING nor TEXT
 */
private List<Span> computeConjunctionMatchingResult(Tuple inputTuple) throws DataFlowException {
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> candidateSpans = filterRelevantSpans(payloadField.getValue());

    List<Span> results = new ArrayList<>();
    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType type = this.inputSchema.getAttribute(attributeName).getAttributeType();
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();

        // types other than TEXT and STRING: throw Exception for now
        if (!(type == AttributeType.STRING || type == AttributeType.TEXT)) {
            throw new DataFlowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }

        if (type == AttributeType.STRING) {
            // STRING: the query has to equal the complete field value
            if (fieldValue.equals(predicate.getQuery())) {
                results.add(new Span(attributeName, 0, predicate.getQuery().length(),
                        predicate.getQuery(), fieldValue));
            }
        } else {
            // TEXT: gather the relevant spans that belong to this attribute
            List<Span> spansOfField = new ArrayList<>();
            for (Span candidate : candidateSpans) {
                if (candidate.getAttributeName().equals(attributeName)) {
                    spansOfField.add(candidate);
                }
            }
            if (isAllQueryTokensPresent(spansOfField, queryTokenSet)) {
                results.addAll(spansOfField);
            }
        }
    }
    return results;
}
use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.
the class KeywordMatcher method computePhraseMatchingResult.
/**
 * Collects the spans of a phrase keyword match for one tuple.
 * <p>
 * A STRING attribute matches only when its value equals the whole query. For
 * a TEXT attribute, the relevant payload spans of that attribute are sorted
 * by token offset and scanned for runs of consecutive spans whose values
 * equal the query tokens (ignoring case) and whose token-offset gaps equal
 * the gaps between those tokens in the original query (stopwords included).
 * Each such run is reported as one combined span.
 *
 * @param inputTuple tuple carrying the payload field and the attributes to search
 * @return the matching spans, empty when nothing matched
 * @throws DataFlowException if an attribute is neither STRING nor TEXT
 */
private List<Span> computePhraseMatchingResult(Tuple inputTuple) throws DataFlowException {
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payload = payloadField.getValue();
    List<Span> relevantSpans = filterRelevantSpans(payload);
    List<Span> matchingResults = new ArrayList<>();
    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getAttributeType();
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();
        // types other than TEXT and STRING: throw Exception for now
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataFlowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }
        // for STRING type, the query should match the fieldValue completely
        if (attributeType == AttributeType.STRING) {
            if (fieldValue.equals(predicate.getQuery())) {
                matchingResults.add(new Span(attributeName, 0, predicate.getQuery().length(), predicate.getQuery(), fieldValue));
            }
        }
        // for TEXT type, run the phrase query against the spans of this field
        if (attributeType == AttributeType.TEXT) {
            List<Span> fieldSpanList = relevantSpans.stream().filter(span -> span.getAttributeName().equals(attributeName)).collect(Collectors.toList());
            if (!isAllQueryTokensPresent(fieldSpanList, queryTokenSet)) {
                // this field's spans fail the token-presence check, so it cannot match
                continue;
            }
            // Sort current field's span list by token offset for later use.
            // Integer.compare is used instead of subtraction, which is the
            // overflow-safe way to order ints.
            Collections.sort(fieldSpanList, (span1, span2) -> Integer.compare(span1.getTokenOffset(), span2.getTokenOffset()));
            // Positions, within the query including stopwords, of the tokens
            // that are also in queryTokenList.
            List<Integer> queryTokenOffset = new ArrayList<>();
            for (int i = 0; i < queryTokensWithStopwords.size(); i++) {
                if (queryTokenList.contains(queryTokensWithStopwords.get(i))) {
                    queryTokenOffset.add(i);
                }
            }
            // index into fieldSpanList of the span where the current
            // candidate phrase starts
            int iter = 0;
            while (iter < fieldSpanList.size()) {
                // not enough spans left to hold the whole phrase
                if (iter > fieldSpanList.size() - queryTokenList.size()) {
                    break;
                }
                // Verify that the spans starting at iter correspond to the
                // phrase query: each adjacent pair must be as far apart (in
                // token offsets) as the corresponding query tokens, and the
                // span values must equal those tokens.
                boolean isMismatchInSpan = false;
                // compare every adjacent pair of query tokens
                for (int i = 0; i < queryTokenList.size() - 1; i++) {
                    Span first = fieldSpanList.get(iter + i);
                    Span second = fieldSpanList.get(iter + i + 1);
                    if (!(second.getTokenOffset() - first.getTokenOffset() == queryTokenOffset.get(i + 1) - queryTokenOffset.get(i) && first.getValue().equalsIgnoreCase(queryTokenList.get(i)) && second.getValue().equalsIgnoreCase(queryTokenList.get(i + 1)))) {
                        // mismatch: retry with the next span as phrase start
                        iter++;
                        isMismatchInSpan = true;
                        break;
                    }
                }
                if (isMismatchInSpan) {
                    continue;
                }
                // Full phrase found: merge it into one span reaching from the
                // start of its first token to the end of its last token.
                int combinedSpanStartIndex = fieldSpanList.get(iter).getStart();
                int combinedSpanEndIndex = fieldSpanList.get(iter + queryTokenList.size() - 1).getEnd();
                Span combinedSpan = new Span(attributeName, combinedSpanStartIndex, combinedSpanEndIndex, predicate.getQuery(), fieldValue.substring(combinedSpanStartIndex, combinedSpanEndIndex));
                matchingResults.add(combinedSpan);
                // continue after the spans consumed by this match
                iter = iter + queryTokenList.size();
            }
        }
    }
    return matchingResults;
}
Aggregations