Search in sources :

Example 41 with DataflowException

use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.

the class NltkSentimentOperator method computeClassLabel.

/**
 * Processes the data file using NLTK.
 *
 * <p>Launches the Python classification script as a child process, waits for
 * it to finish, then reads the CSV file at {@code resultPath} and rebuilds
 * {@code idClassMap} from its rows (first column as key, second column parsed
 * as an integer label; unparsable labels are stored as 0).
 *
 * @param filePath path of the data file handed to the Python script
 * @return always {@code null}; the results are published through {@code idClassMap}
 * @throws DataflowException if the process cannot be started, the wait is
 *         interrupted, or the result file cannot be read or parsed
 */
private String computeClassLabel(String filePath) {
    try {
        /*
         *  In order to use the NLTK package to do classification, we start a
         *  new process to run the package, and wait for the result of running
         *  the process as the class label of this text field.
         *  Python call format:
         *      #python3 nltk_sentiment_classify picklePath dataPath resultPath
         */
        List<String> args = new ArrayList<String>(Arrays.asList(PYTHON, PYTHONSCRIPT, PicklePath, filePath, resultPath));
        ProcessBuilder processBuilder = new ProcessBuilder(args);
        Process p = processBuilder.start();
        // NOTE(review): the exit code is not inspected, so a failed script run
        // may leave a stale result file to be read below -- confirm intended.
        p.waitFor();

        // Read label result from file generated by Python. The reader is
        // closed even when reading fails part-way through.
        List<String[]> allRows;
        try (CSVReader csvReader = new CSVReader(new FileReader(resultPath), SEPARATOR, QUOTECHAR, 1)) {
            allRows = csvReader.readAll();
        }

        idClassMap = new HashMap<String, Integer>();
        for (String[] row : allRows) {
            try {
                idClassMap.put(row[0], Integer.parseInt(row[1]));
            } catch (NumberFormatException e) {
                // A label that is not an integer is recorded as 0.
                idClassMap.put(row[0], 0);
            }
        }
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers further up can observe it.
        Thread.currentThread().interrupt();
        throw new DataflowException(e.getMessage(), e);
    } catch (Exception e) {
        throw new DataflowException(e.getMessage(), e);
    }
    return null;
}
Also used : CSVReader(au.com.bytecode.opencsv.CSVReader) ArrayList(java.util.ArrayList) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) IOException(java.io.IOException) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) FileReader(java.io.FileReader)

Example 42 with DataflowException

use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.

the class NltkSentimentOperator method computeTupleBuffer.

/**
 * Fills {@code tupleBuffer} with up to {@code predicate.getBatchSize()} tuples
 * pulled from the input operator and writes one {@code [ID, text]} row per
 * tuple to the CSV file at {@code BatchedFiles}.
 *
 * @return {@code false} if the input operator produced no tuple (nothing is
 *         written in that case), {@code true} once the batch has been written
 * @throws DataflowException if the batch file cannot be created or written
 */
private boolean computeTupleBuffer() {
    tupleBuffer = new ArrayList<Tuple>();
    // Rows of [ID, text] to be written to the CSV file.
    List<String[]> csvData = new ArrayList<>();

    int batchSize = predicate.getBatchSize();
    for (int i = 0; i < batchSize; i++) {
        Tuple inputTuple = inputOperator.getNextTuple();
        if (inputTuple == null) {
            // Input exhausted before the batch filled up.
            break;
        }
        tupleBuffer.add(inputTuple);
        String[] idTextPair = new String[2];
        idTextPair[0] = inputTuple.getField(SchemaConstants._ID).getValue().toString();
        idTextPair[1] = inputTuple.<IField>getField(predicate.getInputAttributeName()).getValue().toString();
        csvData.add(idTextPair);
    }

    if (tupleBuffer.isEmpty()) {
        return false;
    }

    try {
        if (Files.notExists(Paths.get(BatchedFiles))) {
            Files.createFile(Paths.get(BatchedFiles));
        }
        // try-with-resources closes the writer even if writing fails.
        try (CSVWriter writer = new CSVWriter(new FileWriter(BatchedFiles))) {
            writer.writeAll(csvData);
        }
    } catch (IOException e) {
        throw new DataflowException(e.getMessage(), e);
    }
    return true;
}
Also used : FileWriter(java.io.FileWriter) ArrayList(java.util.ArrayList) CSVWriter(au.com.bytecode.opencsv.CSVWriter) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) IOException(java.io.IOException) Tuple(edu.uci.ics.texera.api.tuple.Tuple)

Example 43 with DataflowException

use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.

the class NlpSplitOperator method open.

/**
 * Opens this operator: opens the input operator, derives the output schema
 * from the input schema, and verifies that the configured input attribute is
 * of type STRING or TEXT. Does nothing when the cursor is not CLOSED.
 *
 * @throws TexeraException if no input operator is set, or if the input
 *         attribute has a type other than STRING or TEXT
 */
@Override
public void open() throws TexeraException {
    // Only a closed operator needs opening.
    if (cursor != CLOSED) {
        return;
    }
    if (inputOperator == null) {
        throw new DataflowException(ErrorMessages.INPUT_OPERATOR_NOT_SPECIFIED);
    }

    inputOperator.open();
    Schema upstreamSchema = inputOperator.getOutputSchema();

    // The output schema is the input schema transformed according to the
    // chosen output format (OneToOne vs. OneToMany).
    outputSchema = transformSchema(inputOperator.getOutputSchema());

    // The attribute being split must hold textual data.
    AttributeType actualType = upstreamSchema.getAttribute(predicate.getInputAttributeName()).getType();
    if (!actualType.equals(AttributeType.STRING) && !actualType.equals(AttributeType.TEXT)) {
        String message = String.format("input attribute %s must have type String or Text, its actual type is %s", predicate.getInputAttributeName(), actualType);
        throw new DataflowException(message);
    }

    cursor = OPENED;
}
Also used : AttributeType(edu.uci.ics.texera.api.schema.AttributeType) Schema(edu.uci.ics.texera.api.schema.Schema) DataflowException(edu.uci.ics.texera.api.exception.DataflowException)

Example 44 with DataflowException

use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.

the class RegexMatcher method computeMatchingResultsWithPattern.

/**
 * Finds every match of {@code pattern} in each attribute listed by the
 * predicate and returns one {@link Span} per match.
 *
 * @param inputTuple tuple whose fields are searched
 * @param predicate supplies the attribute names to search and the regex
 *        recorded in each resulting span
 * @param pattern compiled pattern applied to each field's string value
 * @return spans for all matches, in attribute order and then match order;
 *         empty if nothing matched
 * @throws DataflowException if a listed attribute is neither STRING nor TEXT
 */
public static List<Span> computeMatchingResultsWithPattern(Tuple inputTuple, RegexPredicate predicate, Pattern pattern) {
    List<Span> matchingResults = new ArrayList<>();
    for (String attributeName : predicate.getAttributeNames()) {
        AttributeType attributeType = inputTuple.getSchema().getAttribute(attributeName).getType();
        // types other than TEXT and STRING: throw Exception for now.
        // Checked before the field value is read so an unsupported field is
        // always reported with this error.
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataflowException("RegexMatcher: Fields other than STRING and TEXT are not supported yet");
        }
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();
        Matcher javaMatcher = pattern.matcher(fieldValue);
        while (javaMatcher.find()) {
            int start = javaMatcher.start();
            int end = javaMatcher.end();
            matchingResults.add(new Span(attributeName, start, end, predicate.getRegex(), fieldValue.substring(start, end)));
        }
    }
    return matchingResults;
}
Also used : Matcher(java.util.regex.Matcher) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Span(edu.uci.ics.texera.api.span.Span)

Example 45 with DataflowException

use of edu.uci.ics.texera.api.exception.DataflowException in project textdb by TextDB.

the class RegexMatcher method setUp.

/**
 * Prepares the matcher: validates the input schema against the predicate,
 * builds the output schema (adding a LIST attribute for the span list when a
 * span list name is configured), and initializes the matching machinery that
 * corresponds to the detected regex type.
 *
 * @throws DataflowException if no input operator has been specified
 */
@Override
protected void setUp() throws DataflowException {
    if (inputOperator == null) {
        throw new DataflowException(ErrorMessages.INPUT_OPERATOR_NOT_SPECIFIED);
    }

    Schema upstreamSchema = inputOperator.getOutputSchema();

    // A result attribute is added only when a span list name is configured.
    this.addResultAttribute = predicate.getSpanListName() != null;

    Schema.checkAttributeExists(upstreamSchema, predicate.getAttributeNames());
    if (addResultAttribute) {
        Schema.checkAttributeNotExists(upstreamSchema, predicate.getSpanListName());
    }

    Schema.Builder schemaBuilder = new Schema.Builder(inputOperator.getOutputSchema());
    if (addResultAttribute) {
        schemaBuilder.add(predicate.getSpanListName(), AttributeType.LIST);
    }
    outputSchema = schemaBuilder.build();

    findRegexType();

    // Set up whichever matcher corresponds to the regex type.
    if (this.regexType == RegexType.NO_LABELS) {
        if (predicate.isIgnoreCase()) {
            regexPattern = Pattern.compile(predicate.getRegex(), Pattern.CASE_INSENSITIVE);
        } else {
            regexPattern = Pattern.compile(predicate.getRegex());
        }
        return;
    }
    if (this.regexType == RegexType.LABELED_WITH_QUALIFIERS) {
        labeledRegexProcessor = new LabeledRegexProcessor(predicate);
        return;
    }
    labledRegexNoQualifierProcessor = new LabledRegexNoQualifierProcessor(predicate);
}
Also used : LabeledRegexProcessor(edu.uci.ics.texera.dataflow.regexmatcher.label.LabeledRegexProcessor) LabledRegexNoQualifierProcessor(edu.uci.ics.texera.dataflow.regexmatcher.label.LabledRegexNoQualifierProcessor) Schema(edu.uci.ics.texera.api.schema.Schema) DataflowException(edu.uci.ics.texera.api.exception.DataflowException)

Aggregations

DataflowException (edu.uci.ics.texera.api.exception.DataflowException) 56; TexeraException (edu.uci.ics.texera.api.exception.TexeraException) 23; AttributeType (edu.uci.ics.texera.api.schema.AttributeType) 20; Schema (edu.uci.ics.texera.api.schema.Schema) 20; Tuple (edu.uci.ics.texera.api.tuple.Tuple) 18; IOException (java.io.IOException) 14; Span (edu.uci.ics.texera.api.span.Span) 11; Collectors (java.util.stream.Collectors) 10; SchemaConstants (edu.uci.ics.texera.api.constants.SchemaConstants) 9; ArrayList (java.util.ArrayList) 9; Attribute (edu.uci.ics.texera.api.schema.Attribute) 8; IOperator (edu.uci.ics.texera.api.dataflow.IOperator) 7; IField (edu.uci.ics.texera.api.field.IField) 7; ListField (edu.uci.ics.texera.api.field.ListField) 7; List (java.util.List) 7; AbstractSingleInputOperator (edu.uci.ics.texera.dataflow.common.AbstractSingleInputOperator) 6; ErrorMessages (edu.uci.ics.texera.api.constants.ErrorMessages) 5; StorageException (edu.uci.ics.texera.api.exception.StorageException) 5; IntegerField (edu.uci.ics.texera.api.field.IntegerField) 4; DataflowUtils (edu.uci.ics.texera.dataflow.utils.DataflowUtils) 4