use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.
the class DataReader method buildPayloadFromTermVector.
/**
 * Builds the payload span list of a document from the term vectors stored in the Lucene index.
 *
 * <p>Only attributes of {@code inputSchema} whose type is {@code AttributeType.TEXT} are
 * considered; attributes for which the index reader returns no term vector are skipped.
 * For every occurrence of every term one {@code Span} is produced, carrying the attribute
 * name, the start/end offsets reported by the postings, the analyzed term, the substring of
 * the field value delimited by those offsets, and the token position.
 *
 * @param fields the field values of the document, indexed consistently with {@code inputSchema}
 * @param docID  the document ID passed to the index reader when fetching term vectors
 * @return the list of spans collected over all TEXT attributes (empty if none were found)
 * @throws IOException if reading the term vector or its postings fails
 */
private ArrayList<Span> buildPayloadFromTermVector(List<IField> fields, int docID) throws IOException {
    ArrayList<Span> payloadSpanList = new ArrayList<>();

    for (Attribute attr : inputSchema.getAttributes()) {
        String attributeName = attr.getName();
        AttributeType attributeType = attr.getType();

        // Only TEXT attributes contribute spans to the payload.
        if (attributeType != AttributeType.TEXT) {
            continue;
        }

        String fieldValue = fields.get(inputSchema.getIndex(attributeName)).getValue().toString();

        Terms termVector = luceneIndexReader.getTermVector(docID, attributeName);
        if (termVector == null) {
            continue;
        }

        TermsEnum termsEnum = termVector.iterator();
        PostingsEnum termPostings = null;

        // Go through the terms of the document.
        while (termsEnum.next() != null) {
            termPostings = termsEnum.postings(termPostings, PostingsEnum.ALL);
            if (termPostings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
                continue;
            }

            // The analyzed term and its frequency are constant for the current term,
            // so compute them once instead of once per position.
            String analyzedTermStr = termsEnum.term().utf8ToString();
            int termFrequency = termPostings.freq();

            // For each term, go through its postings.
            for (int i = 0; i < termFrequency; i++) {
                // nextPosition needs to be called first, before the offsets are read.
                int tokenPosition = termPostings.nextPosition();
                int charStart = termPostings.startOffset();
                int charEnd = termPostings.endOffset();
                String originalTermStr = fieldValue.substring(charStart, charEnd);

                payloadSpanList.add(new Span(attributeName, charStart, charEnd,
                        analyzedTermStr, originalTermStr, tokenPosition));
            }
        }
    }

    return payloadSpanList;
}
use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.
the class SpanTupleTest method createSpanListField.
/**
 * Creates a list field holding two sample spans on the "description" attribute.
 *
 * @return a {@code ListField<Span>} wrapping the two spans, in insertion order
 */
private IField createSpanListField() {
    // What the span key holds depends on the matcher that produced it:
    //   RegexMatcher       : the regex, e.g. "n.*k"
    //   NamedEntityMatcher : the entity type, e.g. LOCATION
    //   DictionaryMatcher  : the dictionary entry, e.g. "new york" (key and value are the same)
    //   KeyWordMatcher     : the query, e.g. "new york" (the value can be "new" or "york")
    List<Span> locationSpans = new ArrayList<Span>();
    locationSpans.add(new Span("description", 18, 26, "LOCATION", "new york"));
    locationSpans.add(new Span("description", 52, 63, "LOCATION", "los angeles"));
    return new ListField<Span>(locationSpans);
}
use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.
the class SpanTest method testGetters.
/**
 * Checks that each getter of a span returns the value that was passed to
 * the five-argument constructor.
 */
@Test
public void testGetters() {
    // Values handed to the constructor.
    final String expectedAttributeName = "description";
    final int expectedStart = 10;
    final int expectedEnd = 20;
    final String expectedKey = "location";
    final String expectedValue = "new york";

    span = new Span(expectedAttributeName, expectedStart, expectedEnd, expectedKey, expectedValue);

    // Every getter must hand back exactly what the constructor received.
    Assert.assertEquals(expectedStart, span.getStart());
    Assert.assertEquals(expectedEnd, span.getEnd());
    Assert.assertEquals(expectedKey, span.getKey());
    Assert.assertEquals(expectedValue, span.getValue());
    Assert.assertEquals(expectedAttributeName, span.getAttributeName());
}
use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.
the class JsonSerializationTest method testTupleWithSpanlist.
/**
 * Runs the JSON serialization check on a tuple that has one TEXT attribute
 * and one LIST attribute containing a single span.
 */
@Test
public void testTupleWithSpanlist() {
    TextField textField = new TextField("test");
    Span testSpan = new Span("attr1", 0, 4, "test", "test");
    ListField<Span> spanListField = new ListField<Span>(Arrays.asList(testSpan));

    Tuple tuple = new Tuple.Builder()
            .add("attr1", AttributeType.TEXT, textField)
            .add("spanList", AttributeType.LIST, spanListField)
            .build();

    TestUtils.testJsonSerialization(tuple);
}
use of edu.uci.ics.texera.api.span.Span in project textdb by TextDB.
the class FuzzyTokenMatcherPerformanceTest method match.
/**
 * Runs a fuzzy-token match for every query in the list.
 *
 * <p>For each query a {@code FuzzyTokenMatcherSourceOperator} is opened over the
 * {@code MedlineIndexWriter.ABSTRACT} attribute, all result tuples are consumed, and the
 * sizes of their span lists are summed. The elapsed wall-clock time of each query (in
 * seconds, rounded to four decimal places) is appended to {@code timeResults}, and the
 * span count is added to {@code totalResultCount}.
 *
 * @param queryList         the queries to run, one operator per query
 * @param threshold         the threshold passed to the fuzzy token predicate
 * @param luceneAnalyzerStr the analyzer identifier passed to the predicate
 * @param tableName         the table name passed to the predicate
 * @param bool              not used by this method
 * @throws TexeraException propagated from the source operator
 * @throws IOException     declared by the original signature
 */
public static void match(ArrayList<String> queryList, double threshold, String luceneAnalyzerStr, String tableName, boolean bool) throws TexeraException, IOException {
    List<String> attributeNames = Arrays.asList(MedlineIndexWriter.ABSTRACT);

    for (String query : queryList) {
        FuzzyTokenSourcePredicate predicate = new FuzzyTokenSourcePredicate(query, attributeNames, luceneAnalyzerStr, threshold, tableName, SchemaConstants.SPAN_LIST);
        FuzzyTokenMatcherSourceOperator fuzzyTokenSource = new FuzzyTokenMatcherSourceOperator(predicate);

        long startMatchTime = System.currentTimeMillis();
        fuzzyTokenSource.open();

        int counter = 0;
        try {
            Tuple nextTuple;
            while ((nextTuple = fuzzyTokenSource.getNextTuple()) != null) {
                ListField<Span> spanListField = nextTuple.getField(SchemaConstants.SPAN_LIST);
                List<Span> spanList = spanListField.getValue();
                counter += spanList.size();
            }
        } finally {
            // Release the operator even when fetching a tuple fails.
            fuzzyTokenSource.close();
        }

        long endMatchTime = System.currentTimeMillis();
        double matchTime = (endMatchTime - startMatchTime) / 1000.0;

        // Format with Locale.ROOT so the decimal separator is always '.'; with a default
        // locale that uses ',' the formatted text could not be parsed back by parseDouble.
        timeResults.add(Double.parseDouble(String.format(java.util.Locale.ROOT, "%.4f", matchTime)));
        totalResultCount += counter;
    }
}
Aggregations