use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.
the class DictionaryMatcherSourceOperator method getNextTuple.
/**
 * Returns the next matching tuple, honoring the configured {@code offset} and {@code limit}.
 *
 * <p>For the index-based matching types ({@code PHRASE_INDEXBASED} and
 * {@code CONJUNCTION_INDEXBASED}) all matching results are computed once on the first call and
 * then served one tuple at a time, each with a span list attribute appended. For every other
 * matching type the tuples are pulled from {@code dictionaryMatcher}.
 *
 * @return the next result tuple, or {@code null} when there are no more results or the limit
 *         has been reached
 * @throws TexeraException if producing the next tuple fails
 */
@Override
public Tuple getNextTuple() throws TexeraException {
    // Widen to long before adding: with a large limit (e.g. Integer.MAX_VALUE) and a positive
    // offset, an int sum would wrap around to a negative value and end the output immediately.
    if (cursor >= (long) limit + offset) {
        return null;
    }

    KeywordMatchingType matchingType = predicate.getKeywordMatchingType();
    if (matchingType == KeywordMatchingType.PHRASE_INDEXBASED
            || matchingType == KeywordMatchingType.CONJUNCTION_INDEXBASED) {
        // For each dictionary entry, get all results from KeywordMatcher (done once, lazily).
        if (!resultMapPopulated) {
            computeMatchingResults();
            resultIterator = tupleIDMap.keySet().iterator();
            resultMapPopulated = true;
        }
        while (resultIterator.hasNext()) {
            cursor++;
            String tupleID = resultIterator.next();
            if (cursor <= offset) {
                // Still inside the offset window: skip without building the output tuple.
                continue;
            }
            return new Tuple.Builder(tupleIDMap.get(tupleID))
                    .add(predicate.getSpanListName(), AttributeType.LIST,
                            new ListField<Span>(tupleResultMap.get(tupleID)))
                    .build();
        }
        return null;
    }

    // Substring matching or regex matching (scan based)
    Tuple inputTuple;
    while ((inputTuple = dictionaryMatcher.getNextTuple()) != null) {
        cursor++;
        if (cursor > offset) {
            return inputTuple;
        }
    }
    return null;
}
use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.
the class JoinDistancePredicate method joinTuples.
/**
 * This method is called by the Join operator to perform the join on the
 * tuples passed.
 *
 * <p>The two tuples join only when their {@code _ID} fields and their values of the join
 * attribute are equal. For every pair of spans on the join attribute (one from each tuple)
 * whose start offsets and whose end offsets each differ by at most the threshold, a combined
 * span covering both is produced. All other output fields are copied from {@code innerTuple}.
 *
 * @param innerTuple   tuple whose field values are used for the non-span output fields
 * @param outerTuple   tuple to join with {@code innerTuple}
 * @param outputSchema schema of the resulting tuple
 * @return New Tuple containing the result of join operation, or {@code null} when the tuples
 *         do not join (different {@code _ID} or join field, missing span information, or no
 *         span pair within the threshold).
 * @throws Exception as declared by the overridden method
 */
@Override
public Tuple joinTuples(Tuple innerTuple, Tuple outerTuple, Schema outputSchema) throws Exception {
    /*
     * We expect the values of all fields to be the same for innerTuple and outerTuple.
     * We only check the _ID field and the field to be joined, since they are crucial to the
     * join operator. For other fields, we use the value from innerTuple.
     */
    if (!compareField(innerTuple, outerTuple, SchemaConstants._ID)) {
        return null;
    }
    if (!compareField(innerTuple, outerTuple, this.joinAttributeName)) {
        return null;
    }

    // If either tuple has no usable span information, return null so the caller can
    // move on to the next tuple instead of failing with a NullPointerException.
    List<Span> innerSpanList = extractSpanList(innerTuple);
    List<Span> outerSpanList = extractSpanList(outerTuple);
    if (innerSpanList == null || outerSpanList == null) {
        return null;
    }

    // Loop-invariant; kept boxed so it is only unboxed when a span pair is actually compared.
    Integer threshold = this.getThreshold();
    List<Span> newJoinSpanList = new ArrayList<>();

    for (Span outerSpan : outerSpanList) {
        // Only spans on the join attribute take part in the join.
        if (!outerSpan.getAttributeName().equals(this.joinAttributeName)) {
            continue;
        }
        for (Span innerSpan : innerSpanList) {
            if (!innerSpan.getAttributeName().equals(this.joinAttributeName)) {
                continue;
            }
            boolean startsClose = Math.abs(outerSpan.getStart() - innerSpan.getStart()) <= threshold;
            boolean endsClose = Math.abs(outerSpan.getEnd() - innerSpan.getEnd()) <= threshold;
            if (startsClose && endsClose) {
                // The joined span is the smallest range covering both input spans.
                int newSpanStartIndex = Math.min(innerSpan.getStart(), outerSpan.getStart());
                int newSpanEndIndex = Math.max(innerSpan.getEnd(), outerSpan.getEnd());
                String attributeName = this.joinAttributeName;
                String fieldValue = (String) innerTuple.getField(attributeName).getValue();
                String newFieldValue = fieldValue.substring(newSpanStartIndex, newSpanEndIndex);
                String spanKey = outerSpan.getKey() + "_" + innerSpan.getKey();
                newJoinSpanList.add(
                        new Span(attributeName, newSpanStartIndex, newSpanEndIndex, spanKey, newFieldValue));
            }
        }
    }
    if (newJoinSpanList.isEmpty()) {
        return null;
    }

    // Create output fields based on innerTuple's values. An explicit ArrayList is collected
    // because the joined span list is appended below and Collectors.toList() gives no
    // guarantee that the returned list is mutable.
    List<IField> outputFields = outputSchema.getAttributes().stream()
            .filter(attr -> !attr.equals(SchemaConstants.SPAN_LIST_ATTRIBUTE))
            .map(attr -> innerTuple.getField(attr.getName(), IField.class))
            .collect(Collectors.toCollection(ArrayList::new));
    outputFields.add(new ListField<>(newJoinSpanList));
    return new Tuple(outputSchema, outputFields.toArray(new IField[0]));
}

/**
 * Reads the span list stored in the tuple's {@code SPAN_LIST} field.
 *
 * @param tuple tuple to read from
 * @return the span list, or {@code null} when the field is absent or is not a {@link ListField}
 */
private List<Span> extractSpanList(Tuple tuple) {
    IField spanField = tuple.getField(SchemaConstants.SPAN_LIST);
    if (!(spanField instanceof ListField)) {
        return null;
    }
    // NOTE(review): assumes the SPAN_LIST field always holds Span elements — confirm.
    @SuppressWarnings("unchecked")
    ListField<Span> spanListField = (ListField<Span>) spanField;
    return spanListField.getValue();
}
use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.
the class DictionaryMatcher method appendConjunctionMatchingSpans4Dictionary.
/**
 * Collects the spans of {@code inputTuple} that match the dictionary for the given attributes.
 *
 * <p>For a STRING attribute a single span covering the whole value is added when the value is
 * contained in {@code queryList}. For a TEXT attribute the payload spans kept by
 * {@code filterRelevantSpans} are examined per map key (used as an index into
 * {@code queryTokenSetList}); the spans on that attribute are added when
 * {@code DataflowUtils.isAllQueryTokensPresent} accepts them for that token set.
 *
 * @param inputTuple        tuple carrying the attribute values and the payload span list
 * @param attributeNames    names of the attributes to inspect
 * @param queryTokenSetList token sets, addressed by the keys of the relevant-span map
 * @param queryList         dictionary entries compared against whole STRING values
 * @return the matching spans, possibly empty
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
private List<Span> appendConjunctionMatchingSpans4Dictionary(Tuple inputTuple, List<String> attributeNames, List<Set<String>> queryTokenSetList, List<String> queryList) throws DataflowException {
    List<Span> collected = new ArrayList<>();
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    Map<Integer, List<Span>> spansByQueryIndex = filterRelevantSpans(payloadField.getValue(), queryTokenSetList);

    for (String attributeName : attributeNames) {
        AttributeType type = inputTuple.getSchema().getAttribute(attributeName).getType();
        String text = inputTuple.getField(attributeName).getValue().toString();

        boolean isString = type == AttributeType.STRING;
        boolean isText = type == AttributeType.TEXT;
        // types other than TEXT and STRING: throw Exception for now
        if (!(isString || isText)) {
            throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }

        if (isString) {
            // STRING: the dictionary must contain the complete field value
            if (queryList.contains(text)) {
                collected.add(new Span(attributeName, 0, text.length(), text, text));
            }
            continue;
        }

        // TEXT: every token of the query has to be present among the attribute's spans
        for (Map.Entry<Integer, List<Span>> entry : spansByQueryIndex.entrySet()) {
            int queryIndex = entry.getKey();
            List<Span> spansOnAttribute = new ArrayList<>();
            for (Span candidate : entry.getValue()) {
                if (candidate.getAttributeName().equals(attributeName)) {
                    spansOnAttribute.add(candidate);
                }
            }
            if (DataflowUtils.isAllQueryTokensPresent(spansOnAttribute, queryTokenSetList.get(queryIndex))) {
                collected.addAll(spansOnAttribute);
            }
        }
    }
    return collected;
}
use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.
the class DataReader method constructTuple.
/**
 * Builds a tuple for the Lucene document with the given id.
 *
 * <p>The document is converted to fields via {@code documentToFields}; when
 * {@code payloadAdded} is set, a span-list field built by {@code buildPayloadFromTermVector}
 * is appended as the last field.
 *
 * @param docID id of the document to load from the index searcher
 * @return tuple with {@code outputSchema} and the document's fields
 * @throws IOException    declared by this method; source not visible in this block
 * @throws ParseException declared by this method; source not visible in this block
 */
private Tuple constructTuple(int docID) throws IOException, ParseException {
    ArrayList<IField> fields = documentToFields(luceneIndexSearcher.doc(docID));
    if (payloadAdded) {
        ArrayList<Span> payloadSpans = buildPayloadFromTermVector(fields, docID);
        fields.add(new ListField<Span>(payloadSpans));
    }
    IField[] fieldArray = fields.toArray(new IField[0]);
    return new Tuple(outputSchema, fieldArray);
}
use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.
the class NlpEntityOperator method processOneInputTuple.
/**
 * Runs {@code extractNlpSpans} on every attribute named by the predicate and appends the
 * collected spans to the input tuple as a LIST attribute.
 *
 * @param inputTuple tuple to process
 * @return a copy of {@code inputTuple} extended with the result attribute, or {@code null}
 *         when no spans were extracted
 * @throws TexeraException if processing the tuple fails
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws TexeraException {
    List<Span> extractedSpans = new ArrayList<>();
    for (String attr : predicate.getAttributeNames()) {
        IField attrField = inputTuple.getField(attr);
        List<Span> spansForAttr = extractNlpSpans(attrField, attr);
        extractedSpans.addAll(spansForAttr);
    }

    if (!extractedSpans.isEmpty()) {
        return new Tuple.Builder(inputTuple)
                .add(predicate.getResultAttribute(), AttributeType.LIST, new ListField<Span>(extractedSpans))
                .build();
    }
    // Nothing extracted: the tuple is dropped.
    return null;
}
Aggregations