Use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.
From class NltkSentimentOperator, method open.
/**
 * Opens this operator: checks that an input operator is set, validates that the
 * predicate's input attribute exists in the input schema and has type STRING or
 * TEXT, derives the output schema from the input schema, and finally sets the
 * cursor to OPENED.
 *
 * A DataflowException is thrown when no input operator has been specified.
 *
 * NOTE(review): this snippet has no closing braces and at least one branch body
 * (the cursor guard) is not visible; statements were presumably lost when the
 * snippet was extracted. Confirm against the original project source.
 *
 * @throws TexeraException if the input attribute is not in the input schema, or
 *         if its type is neither STRING nor TEXT
 */
public void open() throws TexeraException {
// Guard on the cursor state.
// NOTE(review): the body of this branch is not visible here; presumably the
// method returns early when the operator is not in the CLOSED state. Confirm.
if (cursor != CLOSED) {
// an input operator must have been set before this operator can be opened
if (inputOperator == null) {
throw new DataflowException(ErrorMessages.INPUT_OPERATOR_NOT_SPECIFIED);
Schema inputSchema = inputOperator.getOutputSchema();
// check that the predicate's input attribute is present in the input schema
if (!inputSchema.containsAttribute(predicate.getInputAttributeName())) {
throw new TexeraException(String.format("input attribute %s is not in the input schema %s", predicate.getInputAttributeName(), inputSchema.getAttributeNames()));
// check that the attribute type is valid: only STRING and TEXT are accepted
AttributeType inputAttributeType = inputSchema.getAttribute(predicate.getInputAttributeName()).getType();
boolean isValidType = inputAttributeType.equals(AttributeType.STRING) || inputAttributeType.equals(AttributeType.TEXT);
if (!isValidType) {
throw new TexeraException(String.format("input attribute %s must have type String or Text, its actual type is %s", predicate.getInputAttributeName(), inputAttributeType));
// generate the output schema by transforming the input operator's output schema
outputSchema = transformSchema(inputOperator.getOutputSchema());
// record that the operator is now open
cursor = OPENED;
Use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.
From class DataReader, method documentToFields.
/**
 * Converts a Lucene document into a list of fields, producing one field per
 * attribute of inputSchema in the iteration order of inputSchema.getAttributes().
 *
 * For each attribute, the value stored in the document under the attribute's
 * name is read as a String and converted with StorageUtils.getField using the
 * attribute's type.
 *
 * NOTE(review): the closing braces of the loop and of the method are not
 * visible in this snippet; the return statement presumably sits after the loop.
 *
 * @param luceneDocument the Lucene document to read the attribute values from
 * @return the list of converted fields
 * @throws ParseException presumably raised by StorageUtils.getField when a
 *         stored value cannot be parsed for its attribute type (TODO confirm)
 */
private ArrayList<IField> documentToFields(Document luceneDocument) throws ParseException {
ArrayList<IField> fields = new ArrayList<>();
for (Attribute attr : inputSchema.getAttributes()) {
AttributeType attributeType = attr.getType();
// look up the value stored in the document under the attribute's name
String fieldValue = luceneDocument.get(attr.getName());
// convert the stored string into a field matching the attribute's type
fields.add(StorageUtils.getField(attributeType, fieldValue));
return fields;
Use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.
From class DataReader, method buildPayloadFromTermVector.
/**
 * Builds payload spans for the document docID from the term vectors held by
 * luceneIndexReader, iterating over the attributes of inputSchema.
 *
 * Each Span is constructed from the attribute name, the start and end character
 * offsets of a posting, the analyzed term text, the substring of the original
 * field value between those offsets, and the token position.
 *
 * NOTE(review): this snippet is visibly incomplete. It has no closing braces,
 * several branch bodies are missing, the while condition has lost its left-hand
 * expression, and the constructed span is never visibly added to
 * payloadSpanList. Confirm every hedged note below against the original source.
 *
 * @param fields the field values of the tuple; looked up by the index that
 *        inputSchema assigns to each attribute name
 * @param docID the document id passed to luceneIndexReader.getTermVector
 * @return the list of payload spans
 * @throws IOException presumably raised by the Lucene index/term-vector calls
 *         (TODO confirm)
 */
private ArrayList<Span> buildPayloadFromTermVector(List<IField> fields, int docID) throws IOException {
ArrayList<Span> payloadSpanList = new ArrayList<>();
for (Attribute attr : inputSchema.getAttributes()) {
String attributeName = attr.getName();
AttributeType attributeType = attr.getType();
// Type check against TEXT.
// NOTE(review): only a fragment of the original comment ("payload.") survived and
// the branch body is not visible; presumably non-TEXT attributes are skipped. Confirm.
if (attributeType != AttributeType.TEXT) {
// original (un-analyzed) string value of this attribute, used to cut out the original term text
String fieldValue = fields.get(inputSchema.getIndex(attributeName)).getValue().toString();
Terms termVector = luceneIndexReader.getTermVector(docID, attributeName);
// NOTE(review): the body of this null check is not visible; presumably the attribute
// is skipped when it has no term vector, since termVector is dereferenced right after. Confirm.
if (termVector == null) {
TermsEnum termsEnum = termVector.iterator();
PostingsEnum termPostings = null;
// go through document terms
// NOTE(review): the loop condition has lost its left-hand expression; presumably it
// advances termsEnum to the next term and stops when that returns null. Confirm.
while (( != null) {
// request postings with all available detail (PostingsEnum.ALL), reusing the previous enum
termPostings = termsEnum.postings(termPostings, PostingsEnum.ALL);
// NOTE(review): the body of this branch is not visible; presumably the term is
// skipped when its postings contain no document. Confirm.
if (termPostings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
// for each term, go through its postings
for (int i = 0; i < termPostings.freq(); i++) {
// nextPosition needs to be called first, before the offsets of that position are read
int tokenPosition = termPostings.nextPosition();
int charStart = termPostings.startOffset();
int charEnd = termPostings.endOffset();
// the term as stored in the index (after analysis)
String analyzedTermStr = termsEnum.term().utf8ToString();
// the matching slice of the original field value
String originalTermStr = fieldValue.substring(charStart, charEnd);
// NOTE(review): the span is never visibly added to payloadSpanList in this snippet;
// presumably an add call was lost. Confirm.
Span span = new Span(attributeName, charStart, charEnd, analyzedTermStr, originalTermStr, tokenPosition);
return payloadSpanList;
Use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.
From class TupleJsonDeserializer, method deserialize.
/**
 * Deserializes a JSON tree into a Tuple.
 *
 * The tree read from the parser is expected to hold a schema node under
 * JsonConstants.SCHEMA and a fields node under JsonConstants.FIELDS. The schema
 * node is converted to a Schema; then, for each attribute index i, the i-th
 * child of the fields node is converted to the IField class reported by that
 * attribute's type (AttributeType.getFieldClass()).
 *
 * NOTE(review): in this snippet the deserialized field is never visibly added
 * to the fields list, and the closing braces are missing; presumably an add
 * call was lost. Confirm against the original source.
 *
 * @param p the JSON parser positioned at the tuple to read
 * @param ctxt the deserialization context (not used by the visible code)
 * @return a Tuple built from the deserialized schema and the fields list
 * @throws IOException declared by the signature; presumably raised when reading the tree fails (TODO confirm)
 * @throws JsonProcessingException declared by the signature; presumably raised when a node cannot be converted (TODO confirm)
 */
public Tuple deserialize(JsonParser p, DeserializationContext ctxt) throws IOException, JsonProcessingException {
// read the whole JSON value at the parser's current position as a tree
JsonNode node = p.getCodec().readTree(p);
JsonNode schemaNode = node.get(JsonConstants.SCHEMA);
JsonNode fieldsNode = node.get(JsonConstants.FIELDS);
Schema schema = new ObjectMapper().treeToValue(schemaNode, Schema.class);
ArrayList<IField> fields = new ArrayList<>();
// fields are matched to attributes by position: the i-th field node belongs to the i-th attribute
for (int i = 0; i < schema.getAttributes().size(); i++) {
AttributeType attributeType = schema.getAttributes().get(i).getType();
JsonNode fieldNode = fieldsNode.get(i);
// the attribute type decides which IField class the node is converted to
IField field = new ObjectMapper().treeToValue(fieldNode, attributeType.getFieldClass());
return new Tuple(schema, fields);
Use of edu.uci.ics.texera.api.schema.AttributeType in project textdb by TextDB.
From class FuzzyTokenMatcher, method processOneInputTuple.
/**
 * Processes one input tuple for fuzzy token matching.
 *
 * Visible steps: optionally attach a payload generated from the tuple, read the
 * payload span list, narrow it with filterRelevantSpans, and then, for every
 * attribute named in the predicate, keep the spans of that attribute whose key
 * is one of the predicate's query tokens and compare their count against the
 * predicate's threshold. Null is returned when matchingResults is empty;
 * otherwise a tuple builder is created from the input tuple and, when
 * addResultAttribute is set, the matching spans are added under the
 * predicate's span list name.
 *
 * NOTE(review): this snippet is visibly incomplete. It has no closing braces,
 * two expressions are truncated, the body of the threshold check is missing,
 * and the snippet ends before any final return. Confirm every hedged note below
 * against the original source.
 *
 * @param inputTuple the tuple to match against the predicate
 * @return null when no spans matched; the non-null result is not visible in
 *         this snippet (presumably the tuple built by the builder; confirm)
 * @throws TexeraException declared by the signature; the visible code throws a
 *         DataflowException for attributes that are neither TEXT nor STRING
 */
public Tuple processOneInputTuple(Tuple inputTuple) throws TexeraException {
// add payload if needed before passing it to the matching functions
if (addPayload) {
Tuple.Builder tupleBuilderPayload = new Tuple.Builder(inputTuple);
tupleBuilderPayload.add(SchemaConstants.PAYLOAD_ATTRIBUTE, new ListField<Span>(DataflowUtils.generatePayloadFromTuple(inputTuple, predicate.getLuceneAnalyzerStr())));
// NOTE(review): the right-hand side of this assignment is missing; presumably the
// tuple built by tupleBuilderPayload is assigned here. Confirm.
inputTuple =;
ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
List<Span> relevantSpans = filterRelevantSpans(payloadField.getValue());
List<Span> matchingResults = new ArrayList<>();
// NOTE(review): the four lines below read like the body of a block comment whose
// opening and closing delimiters were lost during extraction.
* The source operator returns spans even for those fields which did not
* satisfy the threshold criterion. So if two attributes A,B have 10 and
* 5 matching tokens, and we set threshold to 10, the number of spans
* returned is 15. So we need to filter those 5 spans for attribute B.
for (String attributeName : this.predicate.getAttributeNames()) {
AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getType();
// types other than TEXT and STRING: throw Exception for now
if (attributeType != AttributeType.TEXT && attributeType != AttributeType.STRING) {
throw new DataflowException("FuzzyTokenMatcher: Fields other than TEXT or STRING are not supported");
// Keep the spans that belong to this attribute and whose key is one of the query tokens.
// NOTE(review): the start of the expression is missing; presumably it streams over
// relevantSpans and opens the first filter. Confirm.
List<Span> fieldSpans = -> span.getAttributeName().equals(attributeName)).filter(span -> predicate.getQueryTokens().contains(span.getKey())).collect(Collectors.toList());
// NOTE(review): the body of this branch is not visible; presumably the attribute's
// spans are added to matchingResults once they reach the threshold. Confirm.
if (fieldSpans.size() >= predicate.getThreshold()) {
// no attribute produced any matching spans: signal "no match" with null
if (matchingResults.isEmpty()) {
return null;
Tuple.Builder tupleBuilder = new Tuple.Builder(inputTuple);
// attach the matching spans as a LIST attribute only when the result attribute is requested
if (addResultAttribute) {
tupleBuilder.add(predicate.getSpanListName(), AttributeType.LIST, new ListField<Span>(matchingResults));