use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.
the class NlpEntityTestConstants method getTest4ResultTuples.
/**
 * Builds the expected output for test case 4.
 *
 * The expected tuple is built on {@code SCHEMA_TWO_SENTENCE} from two text
 * fields, then extended with a span list stored under
 * {@code REULST_ATTRIBUTE}. The list holds three ORGANIZATION spans tagged
 * with "sentence_one" followed by two PERSON spans tagged with "sentence_two".
 *
 * @return a single-element list containing the expected tuple
 */
public static List<Tuple> getTest4ResultTuples() {
    String organizationKey = NlpEntityType.ORGANIZATION.toString();
    String personKey = NlpEntityType.PERSON.toString();

    // Expected entity spans, kept in the order the assertions rely on.
    List<Span> expectedSpans = new ArrayList<Span>();
    expectedSpans.add(new Span("sentence_one", 0, 9, organizationKey, "Microsoft"));
    expectedSpans.add(new Span("sentence_one", 11, 17, organizationKey, "Google"));
    expectedSpans.add(new Span("sentence_one", 22, 30, organizationKey, "Facebook"));
    expectedSpans.add(new Span("sentence_two", 0, 12, personKey, "Donald Trump"));
    expectedSpans.add(new Span("sentence_two", 17, 29, personKey, "Barack Obama"));

    // The input tuple the spans refer to.
    IField[] sentenceFields = new IField[] {
            new TextField("Microsoft, Google and Facebook are organizations."),
            new TextField("Donald Trump and Barack Obama are persons")
    };
    Tuple sentenceTuple = new Tuple(SCHEMA_TWO_SENTENCE, sentenceFields);

    // Attach the span list as the result attribute.
    Tuple.Builder expectedBuilder = new Tuple.Builder(sentenceTuple);
    expectedBuilder.add(REULST_ATTRIBUTE, new ListField<Span>(expectedSpans));
    return Arrays.asList(expectedBuilder.build());
}
use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.
the class NlpEntityTestConstants method getTest7ResultTuples.
/**
 * Builds the expected output for test case 7.
 *
 * The expected tuple is built on {@code SCHEMA_ONE_SENTENCE} from a single
 * text field, then extended with a span list stored under
 * {@code REULST_ATTRIBUTE}. The list holds one ADJECTIVE span for "warm"
 * (offsets 12 to 16) tagged with "sentence_one".
 *
 * @return a single-element list containing the expected tuple
 */
public static List<Tuple> getTest7ResultTuples() {
    // The only expected entity span.
    List<Span> expectedSpans = new ArrayList<Span>();
    expectedSpans.add(new Span("sentence_one", 12, 16, NlpEntityType.ADJECTIVE.toString(), "warm"));

    // The input tuple the span refers to.
    IField[] sentenceFields = new IField[] {
            new TextField("Feeling the warm sun rays beaming steadily down, the girl decided there was no need to wear a coat.")
    };
    Tuple sentenceTuple = new Tuple(SCHEMA_ONE_SENTENCE, sentenceFields);

    // Attach the span list as the result attribute.
    Tuple.Builder expectedBuilder = new Tuple.Builder(sentenceTuple);
    expectedBuilder.add(REULST_ATTRIBUTE, new ListField<Span>(expectedSpans));
    return Arrays.asList(expectedBuilder.build());
}
use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.
the class SimilarityJoinPredicate method joinTuples.
/**
 * Joins an inner and an outer tuple based on the similarity of their span values.
 *
 * Only spans whose attribute name equals the respective join attribute are
 * considered. Every distinct inner value is compared with every distinct
 * outer value using {@code similarityFunc}; both values of a pair are kept
 * when the similarity is at least {@code similarityThreshold}. All relevant
 * spans carrying a kept value are prefixed (inner spans first, then outer
 * spans) and handed to {@code mergeTuples}.
 *
 * @param innerTuple   tuple from the inner side; its SPAN_LIST field is read
 * @param outerTuple   tuple from the outer side; its SPAN_LIST field is read
 * @param outputSchema schema passed through to {@code mergeTuples}
 * @return the merged tuple, or {@code null} when the threshold equals 0 or
 *         no pair of values reaches the threshold
 * @throws DataflowException declared by the join contract
 */
@Override
public Tuple joinTuples(Tuple innerTuple, Tuple outerTuple, Schema outputSchema) throws DataflowException {
    if (similarityThreshold == 0) {
        return null;
    }

    // Keep only the spans that belong to the inner join attribute.
    ListField<Span> innerListField = innerTuple.getField(SchemaConstants.SPAN_LIST);
    List<Span> innerCandidates = new ArrayList<>();
    for (Span candidate : innerListField.getValue()) {
        if (candidate.getAttributeName().equals(innerJoinAttrName)) {
            innerCandidates.add(candidate);
        }
    }

    // Same for the outer join attribute.
    ListField<Span> outerListField = outerTuple.getField(SchemaConstants.SPAN_LIST);
    List<Span> outerCandidates = new ArrayList<>();
    for (Span candidate : outerListField.getValue()) {
        if (candidate.getAttributeName().equals(outerJoinAttrName)) {
            outerCandidates.add(candidate);
        }
    }

    // Several spans may share a value, so compare distinct values only.
    Set<String> innerValues = innerCandidates.stream()
            .map(Span::getValue)
            .collect(Collectors.toSet());
    Set<String> outerValues = outerCandidates.stream()
            .map(Span::getValue)
            .collect(Collectors.toSet());

    // Collect every value that takes part in at least one similar pair.
    Set<String> similarValues = new HashSet<>();
    for (String innerValue : innerValues) {
        for (String outerValue : outerValues) {
            if (this.similarityFunc.calculateSimilarity(innerValue, outerValue) >= this.similarityThreshold) {
                similarValues.add(innerValue);
                similarValues.add(outerValue);
            }
        }
    }

    // Nothing was similar enough: the tuples do not join.
    if (similarValues.isEmpty()) {
        return null;
    }

    // Emit the matching spans, inner side first, each tagged with its side's prefix.
    List<Span> joinedSpans = new ArrayList<>();
    for (Span innerSpan : innerCandidates) {
        if (similarValues.contains(innerSpan.getValue())) {
            joinedSpans.add(addFieldPrefix(innerSpan, INNER_PREFIX));
        }
    }
    for (Span outerSpan : outerCandidates) {
        if (similarValues.contains(outerSpan.getValue())) {
            joinedSpans.add(addFieldPrefix(outerSpan, OUTER_PREFIX));
        }
    }

    return mergeTuples(innerTuple, outerTuple, outputSchema, joinedSpans);
}
use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.
the class FuzzyTokenMatcher method processOneInputTuple.
/**
 * Applies fuzzy token matching to one input tuple.
 *
 * When {@code addPayload} is set, a payload generated by
 * {@code DataflowUtils.generatePayloadFromTuple} is attached to the tuple
 * first. For each attribute named by the predicate, the payload spans whose
 * key is one of the query tokens are collected; they count as matches only
 * when their number reaches the predicate's threshold.
 *
 * @param inputTuple the tuple to match
 * @return the tuple (with the span list added when {@code addResultAttribute}
 *         is set), or {@code null} when no attribute reaches the threshold
 * @throws TexeraException a {@code DataflowException} is thrown when a
 *         predicate attribute is neither TEXT nor STRING
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws TexeraException {
    // Attach the payload first when requested, so the matching below can read it.
    if (addPayload) {
        Tuple.Builder payloadBuilder = new Tuple.Builder(inputTuple);
        List<Span> generatedPayload = DataflowUtils.generatePayloadFromTuple(inputTuple, predicate.getLuceneAnalyzerStr());
        inputTuple = payloadBuilder
                .add(SchemaConstants.PAYLOAD_ATTRIBUTE, new ListField<Span>(generatedPayload))
                .build();
    }

    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> candidateSpans = filterRelevantSpans(payloadField.getValue());
    List<Span> acceptedSpans = new ArrayList<>();

    /*
     * The source operator also hands back spans for attributes that missed
     * the threshold. Example: attributes A and B have 10 and 5 matching
     * tokens; with a threshold of 10 we still receive 15 spans, so the 5
     * spans of B have to be dropped here.
     */
    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getType();
        // Only TEXT and STRING attributes are supported for now.
        if (!(attributeType == AttributeType.TEXT || attributeType == AttributeType.STRING)) {
            throw new DataflowException("FuzzyTokenMatcher: Fields other than TEXT or STRING are not supported");
        }

        // Spans of this attribute whose key is one of the query tokens.
        List<Span> attributeSpans = new ArrayList<>();
        for (Span candidate : candidateSpans) {
            if (candidate.getAttributeName().equals(attributeName)
                    && predicate.getQueryTokens().contains(candidate.getKey())) {
                attributeSpans.add(candidate);
            }
        }

        if (attributeSpans.size() >= predicate.getThreshold()) {
            acceptedSpans.addAll(attributeSpans);
        }
    }

    if (acceptedSpans.isEmpty()) {
        return null;
    }

    Tuple.Builder resultBuilder = new Tuple.Builder(inputTuple);
    if (addResultAttribute) {
        resultBuilder.add(predicate.getSpanListName(), AttributeType.LIST, new ListField<Span>(acceptedSpans));
    }
    return resultBuilder.build();
}
use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.
the class DictionaryMatcher method appendPhraseMatchingSpans4Dictionary.
/**
 * Computes the phrase-matching spans of one tuple against every dictionary entry.
 *
 * The tuple's payload spans are first grouped per dictionary entry by
 * {@code filterRelevantSpans}. Then, for each requested attribute:
 * <ul>
 *   <li>STRING: one span covering the whole field value is produced when the
 *       value is contained in {@code queryList};</li>
 *   <li>TEXT: for each dictionary entry whose query tokens are all present
 *       among the attribute's spans, the phrase spans are rebuilt with
 *       {@code DataflowUtils.constructPhraseMatchingSpans}.</li>
 * </ul>
 *
 * @param inputTuple                  tuple to match; must carry the PAYLOAD field
 * @param attributeNames              names of the attributes to match on
 * @param queryTokenList              per dictionary entry: query tokens (presumably
 *                                    without stopwords; confirm against caller)
 * @param queryTokenSetList           per dictionary entry: set of query tokens
 * @param queryTokenListWithStopwords per dictionary entry: query tokens including stopwords
 * @param queryList                   the dictionary entries themselves, indexed like the lists above
 * @return all matching spans; empty when nothing matches
 * @throws DataflowException when an attribute is neither STRING nor TEXT
 */
public List<Span> appendPhraseMatchingSpans4Dictionary(Tuple inputTuple, List<String> attributeNames, List<List<String>> queryTokenList, List<Set<String>> queryTokenSetList, List<List<String>> queryTokenListWithStopwords, List<String> queryList) throws DataflowException {
    List<Span> matchingResults = new ArrayList<>();
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payload = payloadField.getValue();
    Map<Integer, List<Span>> relevantSpansMap = filterRelevantSpans(payload, queryTokenSetList);

    for (String attributeName : attributeNames) {
        AttributeType attributeType = inputTuple.getSchema().getAttribute(attributeName).getType();
        // Reject unsupported types before touching the field value, so the
        // caller gets this error rather than a failure while reading the field.
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataflowException("DictionaryMatcher: Fields other than STRING and TEXT are not supported yet");
        }
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();

        if (attributeType == AttributeType.STRING) {
            // For STRING type, the query has to match the field value completely.
            if (queryList.contains(fieldValue)) {
                matchingResults.add(new Span(attributeName, 0, fieldValue.length(), fieldValue, fieldValue));
            }
            continue;
        }

        // TEXT type: spans need to be reconstructed according to the phrase query.
        for (Map.Entry<Integer, List<Span>> entry : relevantSpansMap.entrySet()) {
            int index = entry.getKey();
            List<Span> fieldSpanList = entry.getValue().stream()
                    .filter(span -> span.getAttributeName().equals(attributeName))
                    .collect(Collectors.toList());
            if (fieldSpanList.isEmpty() || !DataflowUtils.isAllQueryTokensPresent(fieldSpanList, queryTokenSetList.get(index))) {
                continue;
            }
            matchingResults.addAll(DataflowUtils.constructPhraseMatchingSpans(attributeName, fieldValue, queryList.get(index), fieldSpanList, queryTokenListWithStopwords.get(index), queryTokenList.get(index)));
        }
    }
    return matchingResults;
}
Aggregations