Use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.
The class ScanBasedSourceOperator, method close().
@Override
public void close() throws TexeraException {
    if (!isOpen) {
        return;
    }
    try {
        dataReader.close();
        isOpen = false;
    } catch (Exception e) {
        throw new DataflowException(e.getMessage(), e);
    }
}
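The isOpen guard makes close() safe to call more than once, which matters when a plan teardown closes an operator that was never opened or was already closed. A minimal driver sketch, assuming the usual open/getNextTuple/close operator lifecycle; the constructor argument and the process step are placeholders, not part of textdb:

// Hypothetical driver illustrating the operator lifecycle.
ScanBasedSourceOperator scan = new ScanBasedSourceOperator(scanPredicate);
try {
    scan.open();
    Tuple tuple;
    while ((tuple = scan.getNextTuple()) != null) {
        process(tuple); // placeholder for downstream work
    }
} finally {
    scan.close();
    scan.close(); // second call is a no-op: isOpen is already false
}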
Use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.
The class TwitterConverter, method getNextTuple().
@Override
public Tuple getNextTuple() throws TexeraException {
    if (cursor == CLOSED) {
        throw new DataflowException(ErrorMessages.OPERATOR_NOT_OPENED);
    }
    Tuple tuple;
    while ((tuple = inputOperator.getNextTuple()) != null) {
        List<IField> tweetFields = generateFieldsFromJson(
                tuple.getField(rawDataAttribute).getValue().toString());
        if (!tweetFields.isEmpty()) {
            cursor++;
            // copy every field except the raw JSON attribute, then append the parsed tweet fields
            List<IField> tupleFields = new ArrayList<>();
            final Tuple finalTuple = tuple;
            tupleFields.addAll(tuple.getSchema().getAttributeNames().stream()
                    .filter(attrName -> !attrName.equalsIgnoreCase(rawDataAttribute))
                    .map(attrName -> finalTuple.getField(attrName, IField.class))
                    .collect(Collectors.toList()));
            tupleFields.addAll(tweetFields);
            return new Tuple(outputSchema, tupleFields);
        }
    }
    return null;
}
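The loop above skips any input tuple whose raw JSON yields no fields, so generateFieldsFromJson doubles as a filter for malformed records. A hedged sketch of a compatible implementation, assuming Jackson for parsing and textdb's TextField/StringField field types; the attribute names and the method body are assumptions, not the project's actual code:

// Hypothetical sketch of generateFieldsFromJson; the real textdb version
// extracts many more tweet attributes.
private List<IField> generateFieldsFromJson(String rawJson) {
    try {
        JsonNode tweet = new ObjectMapper().readTree(rawJson);
        return Arrays.asList(
                new TextField(tweet.get("text").asText()),
                new StringField(tweet.get("user").get("screen_name").asText()));
    } catch (Exception e) {
        // an empty list tells getNextTuple() to skip this record
        return Collections.emptyList();
    }
}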
Use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.
The class DataflowUtils, method tokenizeQueryWithStopwords().
public static ArrayList<String> tokenizeQueryWithStopwords(String luceneAnalyzerStr, String query) {
    Analyzer luceneAnalyzer;
    if (luceneAnalyzerStr.equals(LuceneAnalyzerConstants.standardAnalyzerString())) {
        // use an empty stopword list for the standard analyzer
        CharArraySet emptyStopwords = new CharArraySet(1, true);
        luceneAnalyzer = new StandardAnalyzer(emptyStopwords);
    } else if (luceneAnalyzerStr.equals(LuceneAnalyzerConstants.chineseAnalyzerString())) {
        // use the default smart chinese analyzer,
        // because the smart chinese analyzer's default stopword list is simply a list of punctuation
        // https://lucene.apache.org/core/5_5_0/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html
        luceneAnalyzer = LuceneAnalyzerConstants.getLuceneAnalyzer(luceneAnalyzerStr);
    } else {
        throw new TexeraException("tokenizeQueryWithStopwords: analyzer " + luceneAnalyzerStr + " not recognized");
    }
    ArrayList<String> result = new ArrayList<String>();
    TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(query));
    CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        String queryLowerCase = query.toLowerCase();
        int searchStart = 0;
        while (tokenStream.incrementToken()) {
            String token = term.toString();
            // The analyzer lower-cases tokens; recover the exact token from the query string.
            // Searching from the end of the previous match maps repeated tokens to their own occurrences.
            int tokenIndex = queryLowerCase.indexOf(token, searchStart);
            if (tokenIndex < 0) {
                // the analyzer altered the token beyond case folding; fall back to the analyzed form
                result.add(token);
                continue;
            }
            result.add(query.substring(tokenIndex, tokenIndex + token.length()));
            searchStart = tokenIndex + token.length();
        }
        tokenStream.close();
    } catch (IOException e) {
        throw new DataflowException(e);
    } finally {
        luceneAnalyzer.close();
    }
    return result;
}
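A short usage sketch: because the standard analyzer is given an empty stopword list, no words are dropped, and the original-case recovery returns each token exactly as it appears in the query. The expected output in the comment follows from the code above:

ArrayList<String> tokens = DataflowUtils.tokenizeQueryWithStopwords(
        LuceneAnalyzerConstants.standardAnalyzerString(), "The Quick Brown Fox");
// StandardAnalyzer lower-cases while tokenizing, but the method restores the
// original casing from the query string: [The, Quick, Brown, Fox]
System.out.println(tokens);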
Use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.
The class DataflowUtils, method generatePayload().
public static List<Span> generatePayload(String attributeName, String fieldValue, Analyzer luceneAnalyzer) {
    List<Span> payload = new ArrayList<>();
    try {
        TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(fieldValue));
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);
        int tokenPositionCounter = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            tokenPositionCounter += positionIncrementAttribute.getPositionIncrement();
            int tokenPosition = tokenPositionCounter;
            int charStart = offsetAttribute.startOffset();
            int charEnd = offsetAttribute.endOffset();
            String analyzedTermStr = charTermAttribute.toString();
            String originalTermStr = fieldValue.substring(charStart, charEnd);
            payload.add(new Span(attributeName, charStart, charEnd, analyzedTermStr, originalTermStr, tokenPosition));
        }
        tokenStream.close();
    } catch (IOException e) {
        throw new DataflowException(e);
    }
    return payload;
}
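Each Span records both character offsets and the token position within the field. A hedged usage sketch, assuming Span exposes getters matching its constructor arguments (getStart, getEnd, getValue, getTokenOffset are assumptions):

// Empty stopword list, mirroring tokenizeQueryWithStopwords above.
Analyzer analyzer = new StandardAnalyzer(new CharArraySet(1, true));
List<Span> payload = DataflowUtils.generatePayload("content", "Hello Texera", analyzer);
for (Span span : payload) {
    // expected: "Hello" at [0, 5) token 0, "Texera" at [6, 12) token 1
    System.out.println(span.getValue() + " [" + span.getStart() + ", "
            + span.getEnd() + ") token " + span.getTokenOffset());
}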
Use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.
The class WordCountIndexSource, method computeWordCount().
private void computeWordCount() throws TexeraException {
    try {
        HashMap<String, Integer> wordCountMap = new HashMap<>();
        DataReader dataReader = RelationManager.getInstance().getTableDataReader(
                predicate.getTableName(), new MatchAllDocsQuery());
        dataReader.open();
        IndexReader luceneIndexReader = dataReader.getLuceneIndexReader();
        for (int i = 0; i < luceneIndexReader.numDocs(); i++) {
            Terms termVector = luceneIndexReader.getTermVector(i, predicate.getAttribute());
            TermsEnum termsEnum = termVector.iterator();
            while (termsEnum.next() != null) {
                String key = termsEnum.term().utf8ToString();
                // accumulate the term's total frequency across all documents
                wordCountMap.put(key, wordCountMap.getOrDefault(key, 0) + (int) termsEnum.totalTermFreq());
            }
        }
        luceneIndexReader.close();
        dataReader.close();
        // sort (word, count) entries by descending count
        sortedWordCountMap = wordCountMap.entrySet().stream()
                .sorted((e1, e2) -> e2.getValue().compareTo(e1.getValue()))
                .collect(Collectors.toList());
        wordCountIterator = sortedWordCountMap.iterator();
    } catch (IOException e) {
        throw new DataflowException(e);
    }
}
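The method leaves its result in sortedWordCountMap and wordCountIterator rather than returning it, which suggests getNextTuple() drains the iterator lazily. A hedged sketch of such a consumer; the two-attribute schema and field types are assumptions, not WordCountIndexSource's actual output schema:

// Hypothetical consumer of the iterator built by computeWordCount().
public Tuple getNextTuple() throws TexeraException {
    if (wordCountIterator == null) {
        computeWordCount(); // build the sorted (word, count) list on first call
    }
    if (!wordCountIterator.hasNext()) {
        return null;
    }
    Map.Entry<String, Integer> entry = wordCountIterator.next();
    // assumed schema: word (string) and count (integer)
    return new Tuple(outputSchema,
            new StringField(entry.getKey()), new IntegerField(entry.getValue()));
}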