use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.
the class KeywordMatcher method appendConjunctionMatchingSpans.
/**
 * Builds the list of spans in {@code inputTuple} that match a conjunction keyword query.
 *
 * <p>Each attribute named in {@code attributeNames} is handled according to its type:
 * <ul>
 *   <li>STRING: a single span covering the whole field value is added when the field
 *       value equals {@code queryKeyword} exactly.</li>
 *   <li>TEXT: the payload spans kept by {@code filterRelevantSpans} that belong to this
 *       attribute are added, but only when
 *       {@code DataflowUtils.isAllQueryTokensPresent} accepts them for
 *       {@code queryTokenSet}.</li>
 * </ul>
 *
 * @param inputTuple     tuple whose fields and {@code SchemaConstants.PAYLOAD} span list are inspected
 * @param attributeNames names of the attributes to search
 * @param queryTokenSet  query tokens used to filter and validate the payload spans
 * @param queryKeyword   full query keyword, compared against STRING field values
 * @return the matching spans, possibly empty
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
private List<Span> appendConjunctionMatchingSpans(Tuple inputTuple, List<String> attributeNames, Set<String> queryTokenSet, String queryKeyword) throws DataflowException {
    ListField<Span> payloadListField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payloadSpans = payloadListField.getValue();
    List<Span> collected = new ArrayList<>();

    for (String attributeName : attributeNames) {
        AttributeType type = inputTuple.getSchema().getAttribute(attributeName).getType();
        // The value is read before the type dispatch, exactly as the unsupported-type path expects.
        String text = inputTuple.getField(attributeName).getValue().toString();

        if (type == AttributeType.STRING) {
            // STRING fields only match when the keyword equals the entire value.
            if (queryKeyword.equals(text)) {
                collected.add(new Span(attributeName, 0, text.length(), text, text));
            }
        } else if (type == AttributeType.TEXT) {
            // Keep only the relevant payload spans that were produced for this attribute.
            List<Span> spansOfAttribute = new ArrayList<>();
            for (Span candidate : filterRelevantSpans(payloadSpans, queryTokenSet)) {
                if (candidate.getAttributeName().equals(attributeName)) {
                    spansOfAttribute.add(candidate);
                }
            }
            if (DataflowUtils.isAllQueryTokensPresent(spansOfAttribute, queryTokenSet)) {
                collected.addAll(spansOfAttribute);
            }
        } else {
            // Types other than TEXT and STRING are rejected for now.
            throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }
    }

    return collected;
}
use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.
the class KeywordMatcher method appendPhraseMatchingSpans.
/**
 * Builds the list of spans in {@code inputTuple} that match a phrase keyword query.
 *
 * <p>Each attribute named in {@code attributeNames} is handled according to its type:
 * <ul>
 *   <li>STRING: a single span covering the whole field value is added when the field
 *       value equals {@code queryKeyword} exactly.</li>
 *   <li>TEXT: the payload spans kept by {@code filterRelevantSpans} that belong to this
 *       attribute are gathered; if {@code DataflowUtils.isAllQueryTokensPresent} accepts
 *       them, the spans returned by {@code DataflowUtils.constructPhraseMatchingSpans}
 *       are added. Otherwise the attribute contributes nothing.</li>
 * </ul>
 *
 * @param inputTuple                  tuple whose fields and {@code SchemaConstants.PAYLOAD} span list are inspected
 * @param attributeNames              names of the attributes to search
 * @param queryTokenList              query tokens; also used (as a set) to filter payload spans
 * @param queryTokenListWithStopwords query tokens with stopwords, forwarded to the phrase span construction
 * @param queryKeyword                full query keyword, compared against STRING field values
 * @return the matching spans, possibly empty
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
private List<Span> appendPhraseMatchingSpans(Tuple inputTuple, List<String> attributeNames, List<String> queryTokenList, List<String> queryTokenListWithStopwords, String queryKeyword) throws DataflowException {
    ListField<Span> payloadListField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payloadSpans = payloadListField.getValue();
    List<Span> collected = new ArrayList<>();

    for (String attributeName : attributeNames) {
        AttributeType type = inputTuple.getSchema().getAttribute(attributeName).getType();
        // The value is read before the type dispatch, exactly as the unsupported-type path expects.
        String text = inputTuple.getField(attributeName).getValue().toString();

        if (type == AttributeType.STRING) {
            // STRING fields only match when the keyword equals the entire value.
            if (queryKeyword.equals(text)) {
                collected.add(new Span(attributeName, 0, text.length(), text, text));
            }
        } else if (type == AttributeType.TEXT) {
            // Phrase query: first require every query token to appear among this attribute's spans.
            Set<String> tokenSet = new HashSet<>(queryTokenList);
            List<Span> spansOfAttribute = new ArrayList<>();
            for (Span candidate : filterRelevantSpans(payloadSpans, tokenSet)) {
                if (candidate.getAttributeName().equals(attributeName)) {
                    spansOfAttribute.add(candidate);
                }
            }
            if (DataflowUtils.isAllQueryTokensPresent(spansOfAttribute, tokenSet)) {
                List<Span> phraseSpans = DataflowUtils.constructPhraseMatchingSpans(
                        attributeName, text, queryKeyword, spansOfAttribute,
                        queryTokenListWithStopwords, queryTokenList);
                collected.addAll(phraseSpans);
            }
        } else {
            // Types other than TEXT and STRING are rejected for now.
            throw new DataflowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }
    }

    return collected;
}
use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.
the class NlpEntityTestConstants method getTest1ResultTuples.
/**
 * Builds the expected result for test 1: the one-sentence tuple
 * "Microsoft is an organization." extended with a span list that holds a single
 * ORGANIZATION span over "Microsoft" (offsets 0 to 9 of "sentence_one").
 *
 * @return a one-element list containing the expected tuple
 */
public static List<Tuple> getTest1ResultTuples() {
    List<Span> expectedSpans = new ArrayList<>();
    expectedSpans.add(new Span("sentence_one", 0, 9, NlpEntityType.ORGANIZATION.toString(), "Microsoft"));

    IField[] sentenceFields = { new TextField("Microsoft is an organization.") };
    Tuple sentenceTuple = new Tuple(SCHEMA_ONE_SENTENCE, sentenceFields);

    // Extend the input tuple with the result attribute holding the expected spans.
    Tuple.Builder resultBuilder = new Tuple.Builder(sentenceTuple);
    resultBuilder.add(REULST_ATTRIBUTE, new ListField<>(expectedSpans));
    return Arrays.asList(resultBuilder.build());
}
use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.
the class NlpEntityTestConstants method getTest3ResultTuples.
/**
 * Builds the expected result for test 3: a one-sentence tuple mentioning three
 * organizations and two persons, extended with a span list holding one span per
 * entity, in order of appearance in the sentence.
 *
 * @return a one-element list containing the expected tuple
 */
public static List<Tuple> getTest3ResultTuples() {
    String organization = NlpEntityType.ORGANIZATION.toString();
    String person = NlpEntityType.PERSON.toString();

    // Offsets are character positions within the sentence literal below.
    List<Span> expectedSpans = new ArrayList<>();
    expectedSpans.add(new Span("sentence_one", 0, 9, organization, "Microsoft"));
    expectedSpans.add(new Span("sentence_one", 11, 17, organization, "Google"));
    expectedSpans.add(new Span("sentence_one", 22, 30, organization, "Facebook"));
    expectedSpans.add(new Span("sentence_one", 53, 65, person, "Donald Trump"));
    expectedSpans.add(new Span("sentence_one", 70, 82, person, "Barack Obama"));

    IField[] sentenceFields = { new TextField("Microsoft, Google and Facebook are organizations and Donald Trump and Barack Obama are persons.") };
    Tuple sentenceTuple = new Tuple(SCHEMA_ONE_SENTENCE, sentenceFields);

    // Extend the input tuple with the result attribute holding the expected spans.
    Tuple.Builder resultBuilder = new Tuple.Builder(sentenceTuple);
    resultBuilder.add(REULST_ATTRIBUTE, new ListField<>(expectedSpans));
    return Arrays.asList(resultBuilder.build());
}
use of edu.uci.ics.texera.api.field.ListField in project textdb by TextDB.
the class RegexMatcherTest method testGetNextTupleCorpURL.
/**
 * Runs a URL regex against the URL attribute of the corp table and checks that
 * the results equal two expected tuples: sample tuple 1 with a span over
 * "http://weibo.com" and sample tuple 2 with a span over
 * "https://www.microsoft.com/en-us/".
 */
@Test
public void testGetNextTupleCorpURL() throws Exception {
    String query = "^(https?:\\/\\/)?([\\da-z\\.-]+)\\.([a-z\\.]{2,6})([\\/\\w \\.-]*)*\\/?$";
    List<Tuple> exactResults = RegexMatcherTestHelper.getQueryResults(CORP_TABLE, query, Arrays.asList(RegexTestConstantsCorp.URL));

    List<Tuple> sampleTuples = RegexTestConstantsCorp.getSampleCorpTuples();
    // Expected tuples carry the corp schema plus a list attribute for the result spans.
    Schema schemaWithSpans = new Schema.Builder()
            .add(RegexTestConstantsCorp.SCHEMA_CORP)
            .add(RESULTS, AttributeType.LIST)
            .build();

    List<Tuple> expectedResults = new ArrayList<>();

    // expected to match "http://weibo.com"
    List<Span> weiboSpans = new ArrayList<>();
    weiboSpans.add(new Span(RegexTestConstantsCorp.URL, 0, 16, query, "http://weibo.com"));
    List<IField> weiboFields = new ArrayList<>(sampleTuples.get(1).getFields());
    weiboFields.add(new ListField<>(weiboSpans));
    IField[] weiboFieldArray = weiboFields.toArray(new IField[weiboFields.size()]);
    expectedResults.add(new Tuple(schemaWithSpans, weiboFieldArray));

    // expected to match "https://www.microsoft.com/en-us/"
    List<Span> microsoftSpans = new ArrayList<>();
    microsoftSpans.add(new Span(RegexTestConstantsCorp.URL, 0, 32, query, "https://www.microsoft.com/en-us/"));
    List<IField> microsoftFields = new ArrayList<>(sampleTuples.get(2).getFields());
    microsoftFields.add(new ListField<>(microsoftSpans));
    IField[] microsoftFieldArray = microsoftFields.toArray(new IField[microsoftFields.size()]);
    expectedResults.add(new Tuple(schemaWithSpans, microsoftFieldArray));

    boolean sameResults = TestUtils.equals(expectedResults, exactResults);
    Assert.assertTrue(sameResults);
}
Aggregations