use of edu.uci.ics.textdb.api.field.ListField in project textdb by TextDB.
the class FuzzyTokenMatcher method processOneInputTuple.
/**
 * Selects the fuzzy-token matches of one tuple and records them in the
 * tuple's span list.
 *
 * <p>The PAYLOAD spans are first narrowed by {@code filterRelevantSpans}.
 * For each attribute of the predicate, the spans that belong to that
 * attribute and whose key is one of the query tokens are counted; they are
 * kept only if their number reaches the predicate's threshold. This drops
 * spans of attributes that matched some tokens but not enough of them.
 *
 * @param inputTuple tuple holding the PAYLOAD list field and the span list
 *                   field named by the predicate
 * @return {@code inputTuple} itself, with the kept spans appended to its
 *         span list, or {@code null} if no attribute reached the threshold
 * @throws TextDBException if one of the predicate's attributes is neither
 *                         TEXT nor STRING
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws TextDBException {
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> candidates = filterRelevantSpans(payloadField.getValue());
    List<Span> accepted = new ArrayList<>();

    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType type = this.inputSchema.getAttribute(attributeName).getAttributeType();
        boolean supported = type == AttributeType.TEXT || type == AttributeType.STRING;
        if (!supported) {
            // types other than TEXT and STRING are rejected for now
            throw new DataFlowException("FuzzyTokenMatcher: Fields other than TEXT or STRING are not supported");
        }

        // Spans of this attribute whose key is one of the query tokens.
        List<Span> hits = new ArrayList<>();
        for (Span candidate : candidates) {
            if (candidate.getAttributeName().equals(attributeName)
                    && predicate.getQueryTokens().contains(candidate.getKey())) {
                hits.add(candidate);
            }
        }

        // Keep them only when the attribute on its own satisfies the threshold.
        if (hits.size() >= predicate.getThreshold()) {
            accepted.addAll(hits);
        }
    }

    if (accepted.isEmpty()) {
        return null;
    }

    ListField<Span> spanListField = inputTuple.getField(predicate.getSpanListName());
    spanListField.getValue().addAll(accepted);
    return inputTuple;
}
use of edu.uci.ics.textdb.api.field.ListField in project textdb by TextDB.
the class KeywordMatcher method computeConjunctionMatchingResult.
/**
 * Computes the spans produced by a conjunction keyword match on one tuple.
 *
 * <p>For a STRING attribute the whole field value must equal the query; a
 * single span covering offsets 0 to the query length is then emitted. For a
 * TEXT attribute the relevant payload spans of that attribute are emitted
 * when {@code isAllQueryTokensPresent} accepts them against
 * {@code queryTokenSet}.
 *
 * @param inputTuple tuple holding the PAYLOAD list field and the attributes
 *                   named by the predicate
 * @return the matching spans over all attributes; empty if nothing matched
 * @throws DataFlowException if one of the predicate's attributes is neither
 *                           STRING nor TEXT
 */
private List<Span> computeConjunctionMatchingResult(Tuple inputTuple) throws DataFlowException {
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> candidates = filterRelevantSpans(payloadField.getValue());
    List<Span> collected = new ArrayList<>();

    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType type = this.inputSchema.getAttribute(attributeName).getAttributeType();
        // The field value is read before the type is examined.
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();

        if (type == AttributeType.STRING) {
            // for STRING type, the query should match the fieldValue completely
            String query = predicate.getQuery();
            if (fieldValue.equals(query)) {
                collected.add(new Span(attributeName, 0, query.length(), query, fieldValue));
            }
        } else if (type == AttributeType.TEXT) {
            // gather the relevant spans that belong to this field
            List<Span> spansOfField = new ArrayList<>();
            for (Span candidate : candidates) {
                if (candidate.getAttributeName().equals(attributeName)) {
                    spansOfField.add(candidate);
                }
            }
            if (isAllQueryTokensPresent(spansOfField, queryTokenSet)) {
                collected.addAll(spansOfField);
            }
        } else {
            // types other than TEXT and STRING: throw Exception for now
            throw new DataFlowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }
    }

    return collected;
}
use of edu.uci.ics.textdb.api.field.ListField in project textdb by TextDB.
the class DataReader method constructTuple.
/**
 * Builds the output tuple for the Lucene document with the given id.
 *
 * <p>The document is converted to fields by {@code documentToFields}. When
 * {@code payloadAdded} is set, a list field holding the spans returned by
 * {@code buildPayloadFromTermVector} is appended as the last field.
 *
 * @param docID id of the document to load from {@code luceneIndexSearcher}
 * @return a tuple over {@code outputSchema} containing the document's fields
 * @throws IOException    propagated from the calls made here
 * @throws ParseException propagated from the calls made here
 */
private Tuple constructTuple(int docID) throws IOException, ParseException {
    ArrayList<IField> fields = documentToFields(luceneIndexSearcher.doc(docID));

    if (payloadAdded) {
        ArrayList<Span> payloadSpans = buildPayloadFromTermVector(fields, docID);
        fields.add(new ListField<Span>(payloadSpans));
    }

    return new Tuple(outputSchema, fields.toArray(new IField[fields.size()]));
}
use of edu.uci.ics.textdb.api.field.ListField in project textdb by TextDB.
the class RegexMatcherTest method testRegexText1.
/**
 * Runs the regex {@code test(er|ing|ed|s)?} on the CONTENT attribute of the
 * text table and checks the returned tuples against the expected ones:
 * sample tuples 0, 2 and 3, each extended with its list of matching spans.
 */
@Test
public void testRegexText1() throws Exception {
    String query = "test(er|ing|ed|s)?";
    List<Tuple> exactResults = RegexMatcherTestHelper.getQueryResults(TEXT_TABLE, query, Arrays.asList(RegexTestConstantsText.CONTENT));

    List<Tuple> data = RegexTestConstantsText.getSampleTextTuples();
    Schema spanSchema = Utils.addAttributeToSchema(RegexTestConstantsText.SCHEMA_TEXT, new Attribute(RESULTS, AttributeType.LIST));

    // Index of each sample tuple expected in the result, paired position by
    // position with the spans expected for that tuple.
    int[] matchedTuples = {0, 2, 3};
    List<List<Span>> spansPerTuple = Arrays.asList(
            // "test" and "testing"
            Arrays.asList(
                    new Span(RegexTestConstantsText.CONTENT, 5, 9, query, "test"),
                    new Span(RegexTestConstantsText.CONTENT, 21, 28, query, "testing")),
            // "tests"
            Arrays.asList(new Span(RegexTestConstantsText.CONTENT, 87, 92, query, "tests")),
            // "tested"
            Arrays.asList(new Span(RegexTestConstantsText.CONTENT, 43, 49, query, "tested")));

    List<Tuple> expectedResults = new ArrayList<Tuple>();
    for (int i = 0; i < matchedTuples.length; i++) {
        List<IField> fields = new ArrayList<IField>(data.get(matchedTuples[i]).getFields());
        fields.add(new ListField<Span>(new ArrayList<Span>(spansPerTuple.get(i))));
        expectedResults.add(new Tuple(spanSchema, fields.toArray(new IField[fields.size()])));
    }

    Assert.assertTrue(TestUtils.equals(expectedResults, exactResults));
}
use of edu.uci.ics.textdb.api.field.ListField in project textdb by TextDB.
the class RegexMatcherTest method testRegexText3.
/**
 * Runs the regex {@code ([a-zA-Z])+o[a-z]a[a-z]o} on the CONTENT attribute
 * of the text table and checks the returned tuples against the expected
 * ones: sample tuples 7, 8 and 9, each extended with its list of matching
 * spans.
 */
@Test
public void testRegexText3() throws Exception {
    String query = "([a-zA-Z])+o[a-z]a[a-z]o";
    List<Tuple> exactResults = RegexMatcherTestHelper.getQueryResults(TEXT_TABLE, query, Arrays.asList(RegexTestConstantsText.CONTENT));

    List<Tuple> data = RegexTestConstantsText.getSampleTextTuples();
    Schema spanSchema = Utils.addAttributeToSchema(RegexTestConstantsText.SCHEMA_TEXT, new Attribute(RESULTS, AttributeType.LIST));

    // Index of each sample tuple expected in the result, paired position by
    // position with the spans expected for that tuple.
    int[] matchedTuples = {7, 8, 9};
    List<List<Span>> spansPerTuple = Arrays.asList(
            // "Tomato" and "tomato"
            Arrays.asList(
                    new Span(RegexTestConstantsText.CONTENT, 0, 6, query, "Tomato"),
                    new Span(RegexTestConstantsText.CONTENT, 94, 100, query, "tomato")),
            // "Potato"
            Arrays.asList(new Span(RegexTestConstantsText.CONTENT, 0, 6, query, "Potato")),
            // "avocado"
            Arrays.asList(new Span(RegexTestConstantsText.CONTENT, 53, 60, query, "avocado")));

    List<Tuple> expectedResults = new ArrayList<Tuple>();
    for (int i = 0; i < matchedTuples.length; i++) {
        List<IField> fields = new ArrayList<IField>(data.get(matchedTuples[i]).getFields());
        fields.add(new ListField<Span>(new ArrayList<Span>(spansPerTuple.get(i))));
        expectedResults.add(new Tuple(spanSchema, fields.toArray(new IField[fields.size()])));
    }

    Assert.assertTrue(TestUtils.equals(expectedResults, exactResults));
}
Aggregations