Use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.
Class TupleJsonDeserializer, method deserialize.
/**
 * Deserializes a JSON object into a {@link Tuple}.
 *
 * <p>The object must carry a schema under {@code JsonConstants.SCHEMA} and the field
 * values under {@code JsonConstants.FIELDS}. The i-th field value is converted to the
 * field class reported by the attribute type of the i-th schema attribute.
 *
 * @param p parser from which the tuple's JSON tree is read
 * @param ctxt deserialization context (not used by this method)
 * @return the tuple built from the decoded schema and fields
 * @throws IOException if the schema or fields entry is missing, or if reading or
 *         converting the JSON fails
 */
@Override
public Tuple deserialize(JsonParser p, DeserializationContext ctxt) throws IOException, JsonProcessingException {
    JsonNode node = p.getCodec().readTree(p);
    JsonNode schemaNode = node.get(JsonConstants.SCHEMA);
    JsonNode fieldsNode = node.get(JsonConstants.FIELDS);
    // Fail with a descriptive error rather than an opaque NullPointerException further down.
    if (schemaNode == null || fieldsNode == null) {
        throw new IOException("Tuple JSON must contain both '" + JsonConstants.SCHEMA
                + "' and '" + JsonConstants.FIELDS + "' entries");
    }

    // One mapper serves the schema and every field; building one per field is wasted work.
    ObjectMapper mapper = new ObjectMapper();
    Schema schema = mapper.treeToValue(schemaNode, Schema.class);

    int attributeCount = schema.getAttributes().size();
    ArrayList<IField> fields = new ArrayList<>(attributeCount);
    for (int i = 0; i < attributeCount; i++) {
        // The schema decides which concrete field class the i-th JSON value maps to.
        AttributeType attributeType = schema.getAttributes().get(i).getAttributeType();
        JsonNode fieldNode = fieldsNode.get(i);
        IField field = mapper.treeToValue(fieldNode, attributeType.getFieldClass());
        fields.add(field);
    }
    return new Tuple(schema, fields);
}
Use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.
Class DataReader, method buildPayloadFromTermVector.
/**
 * Builds the payload span list for one document from the term vectors of its TEXT attributes.
 *
 * <p>For every TEXT attribute of {@code inputSchema} that has a term vector, one
 * {@link Span} is produced per term occurrence. Each span records the attribute name,
 * the start/end offsets reported by the postings, the analyzed term, the substring of
 * the field value between those offsets, and the token position.
 *
 * @param fields field values of the document, indexed consistently with {@code inputSchema}
 * @param docID document id passed to the index reader when fetching term vectors
 * @return spans for all term occurrences; empty when no TEXT attribute has a term vector
 * @throws IOException if reading the term vector or its postings fails
 */
private ArrayList<Span> buildPayloadFromTermVector(List<IField> fields, int docID) throws IOException {
    ArrayList<Span> payloadSpanList = new ArrayList<>();
    for (Attribute attr : inputSchema.getAttributes()) {
        String attributeName = attr.getAttributeName();
        AttributeType attributeType = attr.getAttributeType();
        // Only TEXT attributes contribute spans to the payload.
        if (attributeType != AttributeType.TEXT) {
            continue;
        }
        String fieldValue = fields.get(inputSchema.getIndex(attributeName)).getValue().toString();
        Terms termVector = luceneIndexReader.getTermVector(docID, attributeName);
        if (termVector == null) {
            continue;
        }
        TermsEnum termsEnum = termVector.iterator();
        PostingsEnum termPostings = null;
        // Go through the document's terms.
        while (termsEnum.next() != null) {
            termPostings = termsEnum.postings(termPostings, PostingsEnum.ALL);
            if (termPostings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
                continue;
            }
            // The term text and its frequency are fixed for the current term,
            // so compute them once instead of once per occurrence.
            String analyzedTermStr = termsEnum.term().utf8ToString();
            int termFrequency = termPostings.freq();
            // For each term, go through its postings.
            for (int i = 0; i < termFrequency; i++) {
                // nextPosition needs to be called before the offsets are read.
                int tokenPosition = termPostings.nextPosition();
                int charStart = termPostings.startOffset();
                int charEnd = termPostings.endOffset();
                String originalTermStr = fieldValue.substring(charStart, charEnd);
                payloadSpanList.add(new Span(attributeName, charStart, charEnd, analyzedTermStr,
                        originalTermStr, tokenPosition));
            }
        }
    }
    return payloadSpanList;
}
Use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.
Class KeywordMatcherSourceOperator, method buildConjunctionQuery.
/**
 * Builds the conjunction-mode query over the predicate's attributes.
 *
 * <p>Each attribute contributes one SHOULD clause to the returned boolean query:
 * <ul>
 *   <li>STRING attribute: a single term query on the whole predicate query string;</li>
 *   <li>TEXT attribute: a nested boolean query in which every token of
 *       {@code queryTokenSet}, lower-cased, is a MUST clause.</li>
 * </ul>
 *
 * @return the combined boolean query
 * @throws DataFlowException if an attribute is neither STRING nor TEXT
 */
private Query buildConjunctionQuery() throws DataFlowException {
    BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder();
    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getAttributeType();
        // Types other than TEXT and STRING: throw an exception for now.
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataFlowException("KeywordPredicate: Fields other than STRING and TEXT are not supported yet");
        }
        if (attributeType == AttributeType.STRING) {
            Query termQuery = new TermQuery(new Term(attributeName, predicate.getQuery()));
            booleanQueryBuilder.add(termQuery, BooleanClause.Occur.SHOULD);
        }
        if (attributeType == AttributeType.TEXT) {
            BooleanQuery.Builder fieldQueryBuilder = new BooleanQuery.Builder();
            for (String token : queryTokenSet) {
                // Lower-case with a fixed locale: the default-locale variant maps "I" to a
                // dotless "ı" under e.g. a Turkish locale, producing a different query term.
                String normalizedToken = token.toLowerCase(java.util.Locale.ROOT);
                Query termQuery = new TermQuery(new Term(attributeName, normalizedToken));
                fieldQueryBuilder.add(termQuery, BooleanClause.Occur.MUST);
            }
            booleanQueryBuilder.add(fieldQueryBuilder.build(), BooleanClause.Occur.SHOULD);
        }
    }
    return booleanQueryBuilder.build();
}
Use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.
Class FuzzyTokenMatcher, method processOneInputTuple.
/**
 * Applies the fuzzy-token threshold to one input tuple.
 *
 * <p>The relevant payload spans are examined per predicate attribute. Spans of an
 * attribute are kept only when the number of spans whose key is one of the predicate's
 * query tokens reaches the predicate's threshold. Kept spans are appended to the tuple's
 * span list field named by {@code predicate.getSpanListName()}.
 *
 * @param inputTuple tuple carrying a payload field and the span list field
 * @return the same tuple with the matching spans appended, or {@code null} when no
 *         attribute reaches the threshold
 * @throws TextDBException if a predicate attribute is neither TEXT nor STRING
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws TextDBException {
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> candidateSpans = filterRelevantSpans(payloadField.getValue());
    List<Span> acceptedSpans = new ArrayList<>();

    /*
     * The payload contains spans for every attribute, including attributes that
     * fall short of the threshold. Example: attributes A and B have 10 and 5
     * matching tokens and the threshold is 10 -- 15 spans arrive, and the 5
     * belonging to B must be dropped. Hence the per-attribute counting below.
     */
    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType type = this.inputSchema.getAttribute(attributeName).getAttributeType();
        boolean supported = type == AttributeType.TEXT || type == AttributeType.STRING;
        // Only TEXT and STRING attributes are handled for now.
        if (!supported) {
            throw new DataFlowException("FuzzyTokenMatcher: Fields other than TEXT or STRING are not supported");
        }

        // Collect this attribute's spans whose key is one of the query tokens.
        List<Span> spansForAttribute = new ArrayList<>();
        for (Span candidate : candidateSpans) {
            if (candidate.getAttributeName().equals(attributeName)
                    && predicate.getQueryTokens().contains(candidate.getKey())) {
                spansForAttribute.add(candidate);
            }
        }

        if (spansForAttribute.size() >= predicate.getThreshold()) {
            acceptedSpans.addAll(spansForAttribute);
        }
    }

    if (acceptedSpans.isEmpty()) {
        return null;
    }

    ListField<Span> spanListField = inputTuple.getField(predicate.getSpanListName());
    spanListField.getValue().addAll(acceptedSpans);
    return inputTuple;
}
Use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.
Class NlpSentimentOperator, method open.
/**
 * Opens this operator: opens the input operator, validates the input attribute,
 * derives the output schema and builds the sentiment analysis pipeline.
 *
 * <p>Does nothing when the cursor is not {@code CLOSED}. The cursor is switched to
 * {@code OPENED} only after every setup step has succeeded, so a failed call does not
 * leave the operator marked as opened without a pipeline.
 *
 * @throws TextDBException if no input operator has been set, or if opening the input fails
 * @throws RuntimeException if the input attribute is missing from the input schema or
 *         is not of type STRING or TEXT
 */
@Override
public void open() throws TextDBException {
    if (cursor != CLOSED) {
        return;
    }
    if (inputOperator == null) {
        throw new DataFlowException(ErrorMessages.INPUT_OPERATOR_NOT_SPECIFIED);
    }
    inputOperator.open();
    Schema inputSchema = inputOperator.getOutputSchema();

    // Check that the input attribute is present in the input schema.
    String inputAttributeName = predicate.getInputAttributeName();
    if (!inputSchema.containsField(inputAttributeName)) {
        throw new RuntimeException(String.format("input attribute %s is not in the input schema %s", inputAttributeName, inputSchema.getAttributeNames()));
    }

    // Check that the attribute type is valid.
    AttributeType inputAttributeType = inputSchema.getAttribute(inputAttributeName).getAttributeType();
    boolean isValidType = inputAttributeType.equals(AttributeType.STRING) || inputAttributeType.equals(AttributeType.TEXT);
    if (!isValidType) {
        throw new RuntimeException(String.format("input attribute %s must have type String or Text, its actual type is %s", inputAttributeName, inputAttributeType));
    }

    // Generate the output schema by transforming the input schema.
    outputSchema = transformSchema(inputSchema);

    // Set up the NLP sentiment analysis pipeline.
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, parse, sentiment");
    sentimentPipeline = new StanfordCoreNLP(props);

    // Mark as opened last: if anything above throws, a later open() can retry.
    cursor = OPENED;
}
Aggregations