use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.
the class JoinDistancePredicate method generateIntersectionSchema.
/**
 * Builds the join output schema: the attributes of the inner operator's schema
 * that are also present in the outer operator's schema. Membership is decided by
 * {@code List.contains}, i.e. by {@code Attribute} equality (presumably name and
 * type, as the original documentation states).
 *
 * The resulting schema is only accepted when it:
 *   - is not empty,
 *   - contains the join attribute,
 *   - contains the "_ID" attribute,
 *   - contains the "spanList" attribute,
 *   - and the join attribute is of type TEXT or STRING.
 *
 * @param innerOperatorSchema schema of the inner operator
 * @param outerOperatorSchema schema of the outer operator
 * @return the intersection schema, with attributes in the inner schema's order
 * @throws DataFlowException if any of the requirements above is not met
 */
private Schema generateIntersectionSchema(Schema innerOperatorSchema, Schema outerOperatorSchema) throws DataFlowException {
    List<Attribute> innerAttributes = innerOperatorSchema.getAttributes();
    List<Attribute> outerAttributes = outerOperatorSchema.getAttributes();

    // keep the inner attributes that the outer schema also has
    List<Attribute> sharedAttributes = innerAttributes.stream()
            .filter(outerAttributes::contains)
            .collect(Collectors.toList());
    Schema intersectionSchema = new Schema(sharedAttributes.toArray(new Attribute[0]));

    // the output schema must carry every attribute the join relies on
    if (intersectionSchema.getAttributes().isEmpty()) {
        throw new DataFlowException("inner operator and outer operator don't share any common attributes");
    }
    Attribute joinAttribute = intersectionSchema.getAttribute(this.joinAttributeName);
    if (joinAttribute == null) {
        throw new DataFlowException("inner operator or outer operator doesn't contain join attribute");
    }
    if (intersectionSchema.getAttribute(SchemaConstants._ID) == null) {
        throw new DataFlowException("inner operator or outer operator doesn't contain _ID attribute");
    }
    if (intersectionSchema.getAttribute(SchemaConstants.SPAN_LIST) == null) {
        throw new DataFlowException("inner operator or outer operator doesn't contain spanList attribute");
    }

    // only textual attributes can be joined on
    AttributeType joinAttrType = joinAttribute.getAttributeType();
    boolean isTextual = joinAttrType == AttributeType.TEXT || joinAttrType == AttributeType.STRING;
    if (!isTextual) {
        throw new DataFlowException(String.format("Join attribute %s must be either TEXT or STRING.", this.joinAttributeName));
    }
    return intersectionSchema;
}
use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.
the class KeywordMatcher method computeSubstringMatchingResult.
/**
 * Collects the spans where the predicate's query occurs in the predicate's
 * attributes of the given tuple.
 *
 * For a STRING attribute the query must equal the whole field value
 * (case-sensitive); one span covering the query length is produced.
 * For a TEXT attribute every case-insensitive occurrence of the query inside
 * the field value produces one span.
 *
 * The query is matched as literal text, not as a regular expression, so
 * keywords containing characters such as '.', '+', '(' or '[' are searched
 * verbatim instead of being interpreted (or rejected) by the regex engine.
 * Matching runs on the original field value, so span offsets always line up
 * with the text stored in the span.
 *
 * @param inputTuple the tuple whose fields are searched
 * @return the matching spans, empty if nothing matched
 * @throws DataFlowException if one of the attributes is neither STRING nor TEXT
 */
private List<Span> computeSubstringMatchingResult(Tuple inputTuple) throws DataFlowException {
    List<Span> matchingResults = new ArrayList<>();
    String query = predicate.getQuery();
    // compiled on first use and shared by all TEXT attributes of this tuple
    Pattern queryPattern = null;

    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getAttributeType();
        // types other than TEXT and STRING: throw Exception for now
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataFlowException("KeywordMatcher: Fields other than STRING and TEXT are not supported yet");
        }
        // read the value only after the type is known to be supported
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();

        if (attributeType == AttributeType.STRING) {
            // for STRING type, the query should match the fieldValue completely
            if (fieldValue.equals(query)) {
                matchingResults.add(new Span(attributeName, 0, query.length(), query, fieldValue));
            }
        } else {
            if (queryPattern == null) {
                // LITERAL: the keyword is plain text, not a regex.
                // CASE_INSENSITIVE | UNICODE_CASE: replaces lower-casing both sides,
                // which could shift offsets when lower-casing changes the string length.
                queryPattern = Pattern.compile(query,
                        Pattern.LITERAL | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
            }
            Matcher matcher = queryPattern.matcher(fieldValue);
            while (matcher.find()) {
                int start = matcher.start();
                int end = matcher.end();
                matchingResults.add(new Span(attributeName, start, end, query, fieldValue.substring(start, end)));
            }
        }
    }
    return matchingResults;
}
use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.
the class KeywordMatcherSourceOperator method buildPhraseQuery.
/**
 * Builds the Lucene query for a phrase predicate: one sub-query per predicate
 * attribute, combined with SHOULD clauses in a boolean query.
 *
 * STRING attributes get a term query on the query text as given.
 * TEXT attributes get a term query on the lower-cased query when the query
 * consists of a single token; otherwise a phrase query is built from the
 * lower-cased query tokens, skipping tokens found in the standard stop word set
 * while keeping every remaining token at its original position.
 *
 * @return the combined boolean query
 * @throws DataFlowException if an attribute is neither STRING nor TEXT
 */
private Query buildPhraseQuery() throws DataFlowException {
    BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder();

    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getAttributeType();
        boolean isString = attributeType == AttributeType.STRING;
        boolean isText = attributeType == AttributeType.TEXT;

        // types other than TEXT and STRING: throw Exception for now
        if (!isString && !isText) {
            throw new DataFlowException("KeywordPredicate: Fields other than STRING and TEXT are not supported yet");
        }

        Query attributeQuery;
        if (isString) {
            attributeQuery = new TermQuery(new Term(attributeName, predicate.getQuery()));
        } else if (queryTokenList.size() == 1) {
            attributeQuery = new TermQuery(new Term(attributeName, predicate.getQuery().toLowerCase()));
        } else {
            PhraseQuery.Builder phraseQueryBuilder = new PhraseQuery.Builder();
            // position counts stop words too, so gaps are preserved in the phrase
            int position = 0;
            for (String token : queryTokensWithStopwords) {
                if (!StandardAnalyzer.STOP_WORDS_SET.contains(token)) {
                    phraseQueryBuilder.add(new Term(attributeName, token.toLowerCase()), position);
                }
                position++;
            }
            attributeQuery = phraseQueryBuilder.build();
        }
        booleanQueryBuilder.add(attributeQuery, BooleanClause.Occur.SHOULD);
    }
    return booleanQueryBuilder.build();
}
use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.
the class RegexSplitOperator method populateOutputBuffer.
/**
 * Splits the configured attribute of the input tuple and stores one output tuple
 * per piece in {@code outputTupleBuffer}.
 *
 * Each output tuple starts with a newly generated ID, drops the input tuple's
 * "_ID" field, replaces the split attribute with the piece (as a TextField for
 * TEXT, otherwise a StringField) and copies every other field unchanged.
 *
 * Nothing is done (the buffer is left untouched) when the tuple is null or the
 * attribute to split is neither TEXT nor STRING.
 *
 * Per the original note: if the regex does not have any match in the tuple,
 * the whole string is returned as the result (behavior of splitText, which is
 * defined elsewhere).
 *
 * @param inputTuple the tuple to split; may be null
 * @throws TextDBException declared by the original signature
 */
private void populateOutputBuffer(Tuple inputTuple) throws TextDBException {
    if (inputTuple == null) {
        return;
    }

    AttributeType attributeType = this.inputSchema.getAttribute(predicate.getAttributeToSplit()).getAttributeType();
    boolean isSplittable = attributeType == AttributeType.TEXT || attributeType == AttributeType.STRING;
    if (!isSplittable) {
        return;
    }

    String strToSplit = inputTuple.getField(predicate.getAttributeToSplit()).getValue().toString();
    List<String> pieces = splitText(strToSplit);

    outputTupleBuffer = new ArrayList<>();
    for (String piece : pieces) {
        List<IField> outputFields = new ArrayList<>();
        // every output tuple gets its own new ID in first position
        outputFields.add(IDField.newRandomID());

        for (String attributeName : inputSchema.getAttributeNames()) {
            // the input tuple's ID is replaced by the one generated above
            if (attributeName.equals(SchemaConstants._ID)) {
                continue;
            }
            if (!attributeName.equals(predicate.getAttributeToSplit())) {
                outputFields.add(inputTuple.getField(attributeName));
                continue;
            }
            // keep the field type of the split attribute
            IField pieceField = attributeType == AttributeType.TEXT
                    ? new TextField(piece)
                    : new StringField(piece);
            outputFields.add(pieceField);
        }

        outputTupleBuffer.add(new Tuple(outputSchema, outputFields.toArray(new IField[0])));
    }
}
use of edu.uci.ics.textdb.api.schema.AttributeType in project textdb by TextDB.
the class DataReader method documentToFields.
/**
 * Converts a Lucene document into a list of fields, one per attribute of the
 * input schema and in the schema's attribute order. Each stored value is looked
 * up by attribute name and converted by {@code StorageUtils.getField} according
 * to the attribute's type.
 *
 * @param luceneDocument the document to read the stored values from
 * @return the converted fields
 * @throws ParseException propagated from {@code StorageUtils.getField}
 */
private ArrayList<IField> documentToFields(Document luceneDocument) throws ParseException {
    ArrayList<IField> convertedFields = new ArrayList<>();
    for (Attribute attribute : inputSchema.getAttributes()) {
        convertedFields.add(StorageUtils.getField(
                attribute.getAttributeType(),
                luceneDocument.get(attribute.getAttributeName())));
    }
    return convertedFields;
}
Aggregations