Use of edu.uci.ics.texera.api.exception.DataflowException in the project textdb by TextDB.
From the class NltkSentimentOperator, method computeClassLabel.
/**
 * Processes the batched data file with the external NLTK Python classifier and
 * loads the resulting per-tuple class labels into {@code idClassMap}.
 *
 * Python call format:
 *   python3 nltk_sentiment_classify picklePath dataPath resultPath
 *
 * @param filePath path of the CSV file containing the [ID, text] rows to classify
 * @return always {@code null}; callers rely on the {@code idClassMap} side effect
 * @throws DataflowException if starting/waiting on the Python process or reading
 *         its result file fails
 */
private String computeClassLabel(String filePath) {
    try {
        /*
         * In order to use the NLTK package to do classification, we start a
         * new process to run the package, and wait for the result of running
         * the process as the class label of this text field.
         */
        List<String> args = new ArrayList<String>(Arrays.asList(PYTHON, PYTHONSCRIPT, PicklePath, filePath, resultPath));
        ProcessBuilder processBuilder = new ProcessBuilder(args);
        Process p = processBuilder.start();
        p.waitFor();
        // Read the label result from the file generated by Python.
        // try-with-resources guarantees the reader is closed even when
        // readAll() or parsing throws (the original leaked it on that path).
        try (CSVReader csvReader = new CSVReader(new FileReader(resultPath), SEPARATOR, QUOTECHAR, 1)) {
            List<String[]> allRows = csvReader.readAll();
            idClassMap = new HashMap<String, Integer>();
            // Read CSV line by line: row[0] is the tuple ID, row[1] the class label.
            for (String[] row : allRows) {
                try {
                    idClassMap.put(row[0], Integer.parseInt(row[1]));
                } catch (NumberFormatException e) {
                    // Unparsable label: default to class 0 rather than failing the whole batch.
                    idClassMap.put(row[0], 0);
                }
            }
        }
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers up the stack can observe it.
        Thread.currentThread().interrupt();
        throw new DataflowException(e.getMessage(), e);
    } catch (Exception e) {
        throw new DataflowException(e.getMessage(), e);
    }
    return null;
}
Use of edu.uci.ics.texera.api.exception.DataflowException in the project textdb by TextDB.
From the class NltkSentimentOperator, method computeTupleBuffer.
/**
 * Pulls up to {@code predicate.getBatchSize()} tuples from the input operator
 * into {@code tupleBuffer} and writes their [ID, text] pairs to the batch CSV
 * file consumed by the Python classifier.
 *
 * @return {@code false} when the input is exhausted and nothing was buffered,
 *         {@code true} otherwise
 * @throws DataflowException if the batch file cannot be created or written
 */
private boolean computeTupleBuffer() {
    tupleBuffer = new ArrayList<Tuple>();
    // Collect the [ID, text] rows destined for the CSV batch file.
    List<String[]> csvData = new ArrayList<>();
    int i = 0;
    while (i < predicate.getBatchSize()) {
        Tuple inputTuple;
        if ((inputTuple = inputOperator.getNextTuple()) != null) {
            tupleBuffer.add(inputTuple);
            String[] idTextPair = new String[2];
            idTextPair[0] = inputTuple.getField(SchemaConstants._ID).getValue().toString();
            idTextPair[1] = inputTuple.<IField>getField(predicate.getInputAttributeName()).getValue().toString();
            csvData.add(idTextPair);
            i++;
        } else {
            break; // input exhausted before the batch filled up
        }
    }
    if (tupleBuffer.isEmpty()) {
        return false;
    }
    try {
        if (Files.notExists(Paths.get(BatchedFiles))) {
            Files.createFile(Paths.get(BatchedFiles));
        }
        // try-with-resources closes the writer even when writeAll() throws
        // (the original leaked the underlying FileWriter on that path).
        try (CSVWriter writer = new CSVWriter(new FileWriter(BatchedFiles))) {
            writer.writeAll(csvData);
        }
    } catch (IOException e) {
        throw new DataflowException(e.getMessage(), e);
    }
    return true;
}
Use of edu.uci.ics.texera.api.exception.DataflowException in the project textdb by TextDB.
From the class NlpSplitOperator, method open.
/**
 * Opens this operator: validates that an input operator is wired in, opens it,
 * derives the output schema from the input schema, and verifies that the
 * target attribute is of type STRING or TEXT. Idempotent when already open.
 *
 * @throws TexeraException if no input operator is set or the input attribute
 *         has an unsupported type
 */
@Override
public void open() throws TexeraException {
    if (cursor != CLOSED) {
        return; // already open; opening twice is a no-op
    }
    if (inputOperator == null) {
        throw new DataflowException(ErrorMessages.INPUT_OPERATOR_NOT_SPECIFIED);
    }
    inputOperator.open();
    Schema inputSchema = inputOperator.getOutputSchema();
    // generate output schema by transforming the input schema based on what output format
    // is chosen (OneToOne vs. OneToMany); reuse the schema fetched above instead of
    // calling inputOperator.getOutputSchema() a second time.
    outputSchema = transformSchema(inputSchema);
    // check if attribute type is valid
    AttributeType inputAttributeType = inputSchema.getAttribute(predicate.getInputAttributeName()).getType();
    boolean isValidType = inputAttributeType.equals(AttributeType.STRING) || inputAttributeType.equals(AttributeType.TEXT);
    if (!isValidType) {
        throw new DataflowException(String.format("input attribute %s must have type String or Text, its actual type is %s", predicate.getInputAttributeName(), inputAttributeType));
    }
    cursor = OPENED;
}
Use of edu.uci.ics.texera.api.exception.DataflowException in the project textdb by TextDB.
From the class RegexMatcher, method computeMatchingResultsWithPattern.
/**
 * Computes every match of {@code pattern} over each attribute named by the
 * predicate in the given tuple.
 *
 * @param inputTuple tuple whose fields are searched
 * @param predicate  supplies the attribute names and the original regex string
 *                   recorded in each result span
 * @param pattern    pre-compiled regex pattern to match with
 * @return a list of spans, one per match found (possibly empty)
 * @throws DataflowException if any target attribute is not STRING or TEXT
 */
public static List<Span> computeMatchingResultsWithPattern(Tuple inputTuple, RegexPredicate predicate, Pattern pattern) {
    List<Span> matchingResults = new ArrayList<>();
    for (String attributeName : predicate.getAttributeNames()) {
        AttributeType attributeType = inputTuple.getSchema().getAttribute(attributeName).getType();
        // types other than TEXT and STRING: throw Exception for now
        // (checked before touching the field value)
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            // Message fixed: this is RegexMatcher, not KeywordMatcher (copy-paste error).
            throw new DataflowException("RegexMatcher: Fields other than STRING and TEXT are not supported yet");
        }
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();
        Matcher javaMatcher = pattern.matcher(fieldValue);
        while (javaMatcher.find()) {
            int start = javaMatcher.start();
            int end = javaMatcher.end();
            matchingResults.add(new Span(attributeName, start, end, predicate.getRegex(), fieldValue.substring(start, end)));
        }
    }
    return matchingResults;
}
Use of edu.uci.ics.texera.api.exception.DataflowException in the project textdb by TextDB.
From the class RegexMatcher, method setUp.
/**
 * Prepares the matcher for execution: verifies the operator wiring, validates
 * the target attributes, derives the output schema (optionally appending a
 * span-list attribute), and selects the regex evaluation strategy according to
 * whether the regex contains labels.
 *
 * @throws DataflowException if no input operator has been supplied
 */
@Override
protected void setUp() throws DataflowException {
    if (inputOperator == null) {
        throw new DataflowException(ErrorMessages.INPUT_OPERATOR_NOT_SPECIFIED);
    }
    Schema schemaIn = inputOperator.getOutputSchema();
    String spanListName = predicate.getSpanListName();
    this.addResultAttribute = spanListName != null;
    // Every attribute the regex targets must exist; the span-list attribute,
    // when requested, must not collide with an existing one.
    Schema.checkAttributeExists(schemaIn, predicate.getAttributeNames());
    if (addResultAttribute) {
        Schema.checkAttributeNotExists(schemaIn, spanListName);
    }
    Schema.Builder schemaBuilder = new Schema.Builder(inputOperator.getOutputSchema());
    if (addResultAttribute) {
        schemaBuilder.add(spanListName, AttributeType.LIST);
    }
    outputSchema = schemaBuilder.build();
    findRegexType();
    // Choose the evaluation strategy: a plain Java regex for label-free
    // expressions, otherwise one of the two labeled-regex processors.
    switch (this.regexType) {
        case NO_LABELS:
            int flags = predicate.isIgnoreCase() ? Pattern.CASE_INSENSITIVE : 0;
            regexPattern = Pattern.compile(predicate.getRegex(), flags);
            break;
        case LABELED_WITH_QUALIFIERS:
            labeledRegexProcessor = new LabeledRegexProcessor(predicate);
            break;
        default:
            labledRegexNoQualifierProcessor = new LabledRegexNoQualifierProcessor(predicate);
            break;
    }
}
Aggregations