use of edu.uci.ics.textdb.api.exception.DataFlowException in project textdb by TextDB.
the class WordCountIndexSource method computeWordCount.
/**
 * Computes the per-term occurrence counts of the predicate's attribute over the
 * predicate's table and stores them, sorted by descending count, in
 * {@code sortedWordCountMap}; {@code wordCountIterator} is reset to iterate
 * over that sorted list.
 *
 * The counts are obtained by walking the term vector of each document in the
 * table's Lucene index and summing the frequency reported for every term.
 * Documents without a term vector for the attribute are skipped.
 *
 * @throws TextDBException a DataFlowException wrapping any IOException raised
 *         while reading the Lucene index, or whatever the data reader itself throws
 */
private void computeWordCount() throws TextDBException {
    try {
        HashMap<String, Integer> wordCountMap = new HashMap<>();
        DataReader dataReader = RelationManager.getRelationManager()
                .getTableDataReader(predicate.getTableName(), new MatchAllDocsQuery());
        dataReader.open();
        try {
            IndexReader luceneIndexReader = dataReader.getLuceneIndexReader();
            try {
                // NOTE(review): doc ids are bounded by maxDoc(), not numDocs(); if the
                // index can contain deleted documents this loop may miss live ones.
                // Left unchanged here - confirm against how the table is written.
                int documentCount = luceneIndexReader.numDocs();
                for (int i = 0; i < documentCount; i++) {
                    Terms termVector = luceneIndexReader.getTermVector(i, predicate.getAttribute());
                    // Lucene returns null when the document has no term vector for this field.
                    if (termVector == null) {
                        continue;
                    }
                    TermsEnum termsEnum = termVector.iterator();
                    while (termsEnum.next() != null) {
                        String key = termsEnum.term().utf8ToString();
                        // accumulate this document's frequency onto the running total
                        wordCountMap.merge(key, (int) termsEnum.totalTermFreq(), Integer::sum);
                    }
                }
            } finally {
                // release the index reader even when the scan fails
                luceneIndexReader.close();
            }
        } finally {
            // release the data reader even when the scan fails
            dataReader.close();
        }
        sortedWordCountMap = wordCountMap.entrySet().stream()
                .sorted((e1, e2) -> e2.getValue().compareTo(e1.getValue()))
                .collect(Collectors.toList());
        wordCountIterator = sortedWordCountMap.iterator();
    } catch (IOException e) {
        throw new DataFlowException(e);
    }
}
use of edu.uci.ics.textdb.api.exception.DataFlowException in project textdb by TextDB.
the class ScanBasedSourceOperator method close.
/**
 * Closes the underlying data reader if this operator is currently open.
 * Calling it on an operator that is not open is a no-op.
 *
 * @throws TextDBException a DataFlowException wrapping any exception raised
 *         while closing the data reader
 */
@Override
public void close() throws TextDBException {
    if (isOpen) {
        try {
            dataReader.close();
            // only mark as closed once the reader was closed successfully
            isOpen = false;
        } catch (Exception closeFailure) {
            throw new DataFlowException(closeFailure.getMessage(), closeFailure);
        }
    }
}
use of edu.uci.ics.textdb.api.exception.DataFlowException in project textdb by TextDB.
the class ExcelSink method open.
/**
 * Opens the sink: opens the input operator, derives the output schema,
 * creates the workbook and its timestamp-named output file, and writes the
 * header row. Does nothing when the sink is not in the CLOSED state.
 *
 * The output schema is the input schema without the _id attribute, the
 * payload attribute, and any attribute of type LIST.
 *
 * @throws TextDBException a DataFlowException wrapping any IOException raised
 *         while preparing the output directory or file, or whatever the
 *         input operator throws
 */
@Override
public void open() throws TextDBException {
    if (cursor != CLOSED) {
        return;
    }

    inputOperator.open();
    inputSchema = inputOperator.getOutputSchema();

    // keep only the attributes that are written to the sheet
    Attribute[] exportedAttributes = inputSchema.getAttributes().stream()
            .filter(attr -> !(attr.getAttributeName().equalsIgnoreCase(SchemaConstants._ID)
                    || attr.getAttributeName().equalsIgnoreCase(SchemaConstants.PAYLOAD)
                    || attr.getAttributeType().equals(AttributeType.LIST)))
            .toArray(Attribute[]::new);
    outputSchema = new Schema(exportedAttributes);

    wb = new XSSFWorkbook();

    // the file is named after the moment the sink is opened
    DateFormat timestampFormat = new SimpleDateFormat("yyyyMMdd-HHmmss");
    fileName = timestampFormat.format(new Date()) + ".xlsx";

    try {
        if (Files.notExists(Paths.get(excelIndexDirectory))) {
            Files.createDirectories(Paths.get(excelIndexDirectory));
        }
        String outputFilePath = Paths.get(excelIndexDirectory, fileName).toString();
        fileOut = new FileOutputStream(outputFilePath);
    } catch (IOException e) {
        throw new DataFlowException(e);
    }

    sheet = wb.createSheet("new sheet");

    // first row holds the attribute names, one per column
    Row headerRow = sheet.createRow(0);
    int column = 0;
    for (String attributeName : outputSchema.getAttributeNames()) {
        headerRow.createCell(column).setCellValue(attributeName);
        column++;
    }

    cursor = OPENED;
}
use of edu.uci.ics.textdb.api.exception.DataFlowException in project textdb by TextDB.
the class RelationManager method createTable.
/**
 * Creates a new table.
 * Table name must be unique (case insensitive).
 * LuceneAnalyzer must be a valid analyzer string.
 *
 * The "_id" attribute will be added to the table schema.
 * System automatically generates a unique ID for each tuple inserted to a table,
 * the generated ID will be in "_id" field.
 *
 * Note that the index directory is created (if missing) before the directory
 * overlap check and the analyzer validation run, so it may exist on disk even
 * when this method fails with one of those errors.
 *
 * @param tableName the name of the table, must be unique, case is not sensitive
 * @param indexDirectory the directory to store the index and data, must not duplicate with other tables' directories
 * @param schema the schema of the table
 * @param luceneAnalyzerString the string representing the lucene analyzer used
 * @throws StorageException if the table already exists, the index directory cannot be
 *         created or resolved, the index directory is already used by another table,
 *         or the lucene analyzer string is not valid
 */
public void createTable(String tableName, String indexDirectory, Schema schema, String luceneAnalyzerString) throws StorageException {
    // convert the table name to lower case
    tableName = tableName.toLowerCase();
    // table should not exist
    if (checkTableExistence(tableName)) {
        throw new StorageException(String.format("Table %s already exists.", tableName));
    }
    // create the index directory if needed and convert it to its absolute (real) path
    try {
        Path indexPath = Paths.get(indexDirectory);
        if (Files.notExists(indexPath)) {
            Files.createDirectories(indexPath);
        }
        indexDirectory = indexPath.toRealPath().toString();
    } catch (IOException e) {
        throw new StorageException(e);
    }
    // check if the indexDirectory overlaps with another table's index directory
    Query indexDirectoryQuery = new TermQuery(new Term(CatalogConstants.TABLE_DIRECTORY, indexDirectory));
    DataReader tableCatalogDataReader = new DataReader(CatalogConstants.TABLE_CATALOG_DATASTORE, indexDirectoryQuery);
    tableCatalogDataReader.setPayloadAdded(false);
    tableCatalogDataReader.open();
    Tuple nextTuple;
    try {
        nextTuple = tableCatalogDataReader.getNextTuple();
    } finally {
        // always release the catalog reader, even if the lookup fails
        tableCatalogDataReader.close();
    }
    // if the index directory is already taken by another table, throws an exception
    if (nextTuple != null) {
        String overlapTableName = nextTuple.getField(CatalogConstants.TABLE_NAME).getValue().toString();
        throw new StorageException(String.format("Table %s already takes the index directory %s. Please choose another directory.", overlapTableName, indexDirectory));
    }
    // check if the lucene analyzer string is valid
    Analyzer luceneAnalyzer = null;
    try {
        luceneAnalyzer = LuceneAnalyzerConstants.getLuceneAnalyzer(luceneAnalyzerString);
    } catch (DataFlowException e) {
        // NOTE(review): the original DataFlowException is not chained as the cause here;
        // chain it if StorageException offers a (String, Throwable) constructor.
        throw new StorageException("Lucene Analyzer String is not valid.");
    }
    // create the directory and clear all data in the index directory
    Schema tableSchema = Utils.getSchemaWithID(schema);
    DataStore tableDataStore = new DataStore(indexDirectory, tableSchema);
    DataWriter dataWriter = new DataWriter(tableDataStore, luceneAnalyzer);
    dataWriter.open();
    try {
        dataWriter.clearData();
    } finally {
        // always release the writer, even if clearing the data fails
        dataWriter.close();
    }
    // write table info to catalog
    writeTableInfoToCatalog(tableName, indexDirectory, schema, luceneAnalyzerString);
}
use of edu.uci.ics.textdb.api.exception.DataFlowException in project textdb by TextDB.
the class KeywordMatcherSourceOperator method buildConjunctionQuery.
/**
 * Builds the Lucene query for the keyword predicate across all of the
 * predicate's attributes.
 *
 * For a STRING attribute the whole predicate query is matched as a single
 * term. For a TEXT attribute every token in queryTokenSet (lower-cased) must
 * match. The per-attribute queries are combined with SHOULD, so a match on
 * any one attribute is sufficient.
 *
 * @return the combined boolean query
 * @throws DataFlowException if an attribute is of a type other than STRING or TEXT
 */
private Query buildConjunctionQuery() throws DataFlowException {
    BooleanQuery.Builder combinedQueryBuilder = new BooleanQuery.Builder();

    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType type = this.inputSchema.getAttribute(attributeName).getAttributeType();

        if (type == AttributeType.STRING) {
            // STRING fields: the full query string is a single exact term
            Term wholeQueryTerm = new Term(attributeName, predicate.getQuery());
            combinedQueryBuilder.add(new TermQuery(wholeQueryTerm), BooleanClause.Occur.SHOULD);
        } else if (type == AttributeType.TEXT) {
            // TEXT fields: every query token has to appear in the field
            BooleanQuery.Builder allTokensBuilder = new BooleanQuery.Builder();
            for (String queryToken : queryTokenSet) {
                Term tokenTerm = new Term(attributeName, queryToken.toLowerCase());
                allTokensBuilder.add(new TermQuery(tokenTerm), BooleanClause.Occur.MUST);
            }
            combinedQueryBuilder.add(allTokensBuilder.build(), BooleanClause.Occur.SHOULD);
        } else {
            // types other than TEXT and STRING: throw Exception for now
            throw new DataFlowException("KeywordPredicate: Fields other than STRING and TEXT are not supported yet");
        }
    }

    return combinedQueryBuilder.build();
}
Aggregations