Example 1 with Document

Use of org.knime.ext.textprocessing.data.Document in the knime-cloud project by KNIME.

The compute method of the TranslateOperation class:

void compute(final RowInput in, final RowOutput out, final ExecutionContext exec, final long rowCount) throws Exception {
    // Create a connection to the Translate service in the provided region
    final TranslateConnection conn = new TranslateConnection(m_cxnInfo);
    final AmazonTranslate translate = conn.getClient();
    int textColumnIdx = in.getDataTableSpec().findColumnIndex(m_textColumnName);
    long rowCounter = 0;
    // For each input row, grab the text column, make the call to Translate
    // and push the input plus the translation to the output.
    DataRow inputRow = null;
    while ((inputRow = in.poll()) != null) {
        // Check for cancel and update the row progress
        ++rowCounter;
        exec.checkCanceled();
        if (rowCount > 0) {
            exec.setProgress(rowCounter / (double) rowCount, "Processing row " + rowCounter + " of " + rowCount);
        }
        // Grab the text to evaluate
        String textValue = null;
        final DataCell cell = inputRow.getCell(textColumnIdx);
        // Build the output row: copy every input cell and reserve one extra,
        // missing-by-default slot at the end for the translation.
        final int numInputColumns = inputRow.getNumCells();
        DataCell[] cells = Stream.generate(DataType::getMissingCell).limit(numInputColumns + 1).toArray(DataCell[]::new);
        for (int i = 0; i < numInputColumns; i++) {
            cells[i] = inputRow.getCell(i);
        }
        if (!cell.isMissing()) {
            if (cell.getType().isCompatible(DocumentValue.class)) {
                final Document doc = ((DocumentValue) cell).getDocument();
                textValue = doc.getTitle() + " " + doc.getDocumentBodyText();
            } else {
                textValue = cell.toString();
            }
            final TranslateTextRequest request = new TranslateTextRequest().withText(textValue).withSourceLanguageCode(m_sourceLangCode).withTargetLanguageCode(m_targetLangCode);
            final TranslateTextResult result = translate.translateText(request);
            cells[numInputColumns] = new StringCell(result.getTranslatedText());
        }
        // Create a new data row and push it to the output container.
        out.push(new DefaultRow(inputRow.getKey(), cells));
    }
}
Also used : TranslateTextRequest(com.amazonaws.services.translate.model.TranslateTextRequest) Document(org.knime.ext.textprocessing.data.Document) DataRow(org.knime.core.data.DataRow) TranslateTextResult(com.amazonaws.services.translate.model.TranslateTextResult) DocumentValue(org.knime.ext.textprocessing.data.DocumentValue) AmazonTranslate(com.amazonaws.services.translate.AmazonTranslate) StringCell(org.knime.core.data.def.StringCell) DataCell(org.knime.core.data.DataCell) DataType(org.knime.core.data.DataType) DefaultRow(org.knime.core.data.def.DefaultRow)
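
The TranslateConnection and m_cxnInfo objects above are KNIME-specific wrappers around the AWS SDK client builder. The following is a minimal sketch of the same request/response cycle against the plain AWS SDK for Java v1, assuming default credentials and a hard-coded region; the class name TranslateSketch and the sample strings are illustrative only.

import com.amazonaws.regions.Regions;
import com.amazonaws.services.translate.AmazonTranslate;
import com.amazonaws.services.translate.AmazonTranslateClientBuilder;
import com.amazonaws.services.translate.model.TranslateTextRequest;
import com.amazonaws.services.translate.model.TranslateTextResult;

public class TranslateSketch {
    public static void main(final String[] args) {
        // Build the client directly; TranslateConnection.getClient() in the
        // snippet above hands back an equivalent AmazonTranslate instance.
        final AmazonTranslate translate = AmazonTranslateClientBuilder.standard()
                .withRegion(Regions.US_EAST_1)
                .build();
        // Same request shape as in compute(): the text plus source and target
        // language codes.
        final TranslateTextRequest request = new TranslateTextRequest()
                .withText("Hello, world")
                .withSourceLanguageCode("en")
                .withTargetLanguageCode("de");
        final TranslateTextResult result = translate.translateText(request);
        System.out.println(result.getTranslatedText());
    }
}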

Example 2 with Document

Use of org.knime.ext.textprocessing.data.Document in the knime-cloud project by KNIME.

The compute method of the ComprehendTaggerOperation class:

@Override
public final void compute(final RowInput in, final RowOutput out, final AmazonComprehend comprehendClient, final int textColIdx, final ExecutionContext exec, final long rowCount) throws CanceledExecutionException, InterruptedException {
    // Create the tagger that uses the detect entities capability of the Comprehend
    // service.
    final DocumentTagger tagger = getTagger(comprehendClient, ComprehendUtils.LANG_MAP.getOrDefault(m_sourceLanguage, "en"), m_tokenizerName);
    final TextContainerDataCellFactory docCellFactory = TextContainerDataCellFactoryBuilder.createDocumentCellFactory();
    docCellFactory.prepare(FileStoreFactory.createFileStoreFactory(exec));
    long inputRowIndex = 0;
    long rowCounter = 0;
    // Tag each input document
    DataRow inputRow = null;
    while ((inputRow = in.poll()) != null) {
        // Check for cancel and update the row progress
        ++rowCounter;
        exec.checkCanceled();
        if (rowCount > 0) {
            exec.setProgress(rowCounter / (double) rowCount, "Processing row " + rowCounter + " of " + rowCount);
        }
        // Grab the text to evaluate
        final DataCell cell = inputRow.getCell(textColIdx);
        final DataCell newDataCell;
        if (!cell.isMissing()) {
            final Document outputDoc = tagger.tag(((DocumentValue) cell).getDocument());
            newDataCell = docCellFactory.createDataCell(outputDoc);
        } else {
            newDataCell = cell;
        }
        // Create cells containing the output data.
        // Copy the input data to the output
        final int numInputColumns = inputRow.getNumCells();
        final DataCell[] cells = m_newColName != null ? new DataCell[numInputColumns + 1] : new DataCell[numInputColumns];
        for (int i = 0; i < numInputColumns; i++) {
            cells[i] = inputRow.getCell(i);
        }
        // Copy the output document tagged with entities to the output
        cells[m_newColName != null ? numInputColumns : textColIdx] = newDataCell;
        // Create a new data row and push it to the output container.
        final RowKey key = new RowKey("Row " + inputRowIndex);
        final DataRow row = new DefaultRow(key, cells);
        out.push(row);
        ++inputRowIndex;
    }
}
Also used : RowKey(org.knime.core.data.RowKey) DataCell(org.knime.core.data.DataCell) DocumentTagger(org.knime.ext.textprocessing.nodes.tagging.DocumentTagger) TextContainerDataCellFactory(org.knime.ext.textprocessing.util.TextContainerDataCellFactory) Document(org.knime.ext.textprocessing.data.Document) DefaultRow(org.knime.core.data.def.DefaultRow) DataRow(org.knime.core.data.DataRow)
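
getTagger and the resulting DocumentTagger belong to KNIME's text-processing framework, but the entity detection they build on is Comprehend's detectEntities call. Here is a minimal sketch of that underlying call, assuming default credentials; the class name DetectEntitiesSketch and the sample sentence are illustrative only.

import com.amazonaws.services.comprehend.AmazonComprehend;
import com.amazonaws.services.comprehend.AmazonComprehendClientBuilder;
import com.amazonaws.services.comprehend.model.DetectEntitiesRequest;
import com.amazonaws.services.comprehend.model.DetectEntitiesResult;
import com.amazonaws.services.comprehend.model.Entity;

public class DetectEntitiesSketch {
    public static void main(final String[] args) {
        final AmazonComprehend client = AmazonComprehendClientBuilder.defaultClient();
        final DetectEntitiesRequest request = new DetectEntitiesRequest()
                .withText("KNIME was founded in Konstanz, Germany.")
                .withLanguageCode("en");
        final DetectEntitiesResult result = client.detectEntities(request);
        // Each entity carries its surface text, a type such as ORGANIZATION or
        // LOCATION, and a confidence score; a document tagger turns these into
        // tags on the KNIME Document.
        for (final Entity entity : result.getEntities()) {
            System.out.printf("%s [%s] %.2f%n", entity.getText(), entity.getType(), entity.getScore());
        }
    }
}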

Example 3 with Document

Use of org.knime.ext.textprocessing.data.Document in the knime-cloud project by KNIME.

The compute method of the KeyPhrasesOperation class:

@Override
public void compute(final RowInput in, final RowOutput out, final AmazonComprehend comprehendClient, final int textColIdx, final ExecutionContext exec, final long rowCount) throws CanceledExecutionException, InterruptedException {
    // Row counter for progress reporting
    long rowCounter = 0;
    final int numInputColumns = in.getDataTableSpec().getNumColumns();
    // Create row batches based on global batch size and process one batch in one request
    final List<DataRow> rowBatch = new ArrayList<>(ComprehendUtils.BATCH_SIZE);
    final List<String> texts = new ArrayList<>(ComprehendUtils.BATCH_SIZE);
    final Set<Integer> validRows = new HashSet<>(ComprehendUtils.BATCH_SIZE);
    DataRow inputRow = null;
    while ((inputRow = in.poll()) != null) {
        // Check for cancel and update the row progress
        ++rowCounter;
        exec.checkCanceled();
        if (rowCount > 0) {
            exec.setProgress(rowCounter / (double) rowCount, "Processing row " + rowCounter + " of " + rowCount);
        }
        rowBatch.add(inputRow);
        final DataCell cell = inputRow.getCell(textColIdx);
        if (!cell.isMissing()) {
            String textValue = null;
            if (cell.getType().isCompatible(DocumentValue.class)) {
                final Document doc = ((DocumentValue) cell).getDocument();
                textValue = doc.getTitle() + " " + doc.getDocumentBodyText();
            } else {
                textValue = cell.toString();
            }
            texts.add(textValue);
            validRows.add(rowBatch.size() - 1);
        }
        // A full batch has accumulated: send it in a single request. processChunk
        // is presumably also responsible for clearing rowBatch, texts, and
        // validRows before the next batch is collected.
        if (rowBatch.size() == ComprehendUtils.BATCH_SIZE) {
            processChunk(out, comprehendClient, numInputColumns, rowBatch, texts, validRows);
        }
    }
    // Process the final, possibly partial, chunk
    processChunk(out, comprehendClient, numInputColumns, rowBatch, texts, validRows);
}
Also used : ArrayList(java.util.ArrayList) Document(org.knime.ext.textprocessing.data.Document) DataRow(org.knime.core.data.DataRow) DocumentValue(org.knime.ext.textprocessing.data.DocumentValue) DataCell(org.knime.core.data.DataCell) HashSet(java.util.HashSet)
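
processChunk itself is not shown on this page. Assuming it issues one BatchDetectKeyPhrases request per collected batch (Comprehend's batch APIs accept at most 25 documents per call, which is presumably what ComprehendUtils.BATCH_SIZE reflects), its core would look roughly like the sketch below; the class and helper names are illustrative.

import java.util.List;

import com.amazonaws.services.comprehend.AmazonComprehend;
import com.amazonaws.services.comprehend.model.BatchDetectKeyPhrasesItemResult;
import com.amazonaws.services.comprehend.model.BatchDetectKeyPhrasesRequest;
import com.amazonaws.services.comprehend.model.BatchDetectKeyPhrasesResult;
import com.amazonaws.services.comprehend.model.KeyPhrase;

public final class KeyPhrasesSketch {

    // Sends one batch of up to 25 texts in a single request, mirroring what
    // processChunk would do with the collected texts buffer.
    static void detectKeyPhrases(final AmazonComprehend client, final List<String> texts) {
        final BatchDetectKeyPhrasesRequest request = new BatchDetectKeyPhrasesRequest()
                .withTextList(texts)
                .withLanguageCode("en");
        final BatchDetectKeyPhrasesResult result = client.batchDetectKeyPhrases(request);
        for (final BatchDetectKeyPhrasesItemResult item : result.getResultList()) {
            // getIndex() maps each item back to its position in the submitted
            // text list.
            for (final KeyPhrase phrase : item.getKeyPhrases()) {
                System.out.printf("text %d: %s (%.2f)%n", item.getIndex(), phrase.getText(), phrase.getScore());
            }
        }
    }
}

The getIndex() bookkeeping is also why compute() records validRows: rows with a missing text cell are never submitted, so result indices have to be mapped back to positions in the row batch.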

Example 4 with Document

Use of org.knime.ext.textprocessing.data.Document in the knime-cloud project by KNIME.

The compute method of the LanguageOperation class:

@Override
public void compute(final RowInput in, final RowOutput out, final AmazonComprehend comprehendClient, final int textColIdx, final ExecutionContext exec, final long rowCount) throws CanceledExecutionException, InterruptedException {
    long rowCounter = 0;
    final int numInputColumns = in.getDataTableSpec().getNumColumns();
    // Create row batches based on global batch size and process one batch in one request
    final List<DataRow> rowBatch = new ArrayList<>(ComprehendUtils.BATCH_SIZE);
    final List<String> texts = new ArrayList<>(ComprehendUtils.BATCH_SIZE);
    final Set<Integer> validRows = new HashSet<>(ComprehendUtils.BATCH_SIZE);
    DataRow inputRow = null;
    while ((inputRow = in.poll()) != null) {
        // Check for cancel and update the row progress
        ++rowCounter;
        exec.checkCanceled();
        if (rowCount > 0) {
            exec.setProgress(rowCounter / (double) rowCount, "Processing row " + rowCounter + " of " + rowCount);
        }
        rowBatch.add(inputRow);
        final DataCell cell = inputRow.getCell(textColIdx);
        if (!cell.isMissing()) {
            String textValue = null;
            if (cell.getType().isCompatible(DocumentValue.class)) {
                final Document doc = ((DocumentValue) cell).getDocument();
                textValue = doc.getTitle() + " " + doc.getDocumentBodyText();
            } else {
                textValue = cell.toString();
            }
            texts.add(textValue);
            validRows.add(rowBatch.size() - 1);
        }
        // A full batch has accumulated: send it in a single request. processChunk
        // is presumably also responsible for clearing rowBatch, texts, and
        // validRows before the next batch is collected.
        if (rowBatch.size() == ComprehendUtils.BATCH_SIZE) {
            processChunk(out, comprehendClient, numInputColumns, rowBatch, texts, validRows);
        }
    }
    // Process the final, possibly partial, chunk
    processChunk(out, comprehendClient, numInputColumns, rowBatch, texts, validRows);
}
Also used : ArrayList(java.util.ArrayList) Document(org.knime.ext.textprocessing.data.Document) DataRow(org.knime.core.data.DataRow) DocumentValue(org.knime.ext.textprocessing.data.DocumentValue) DataCell(org.knime.core.data.DataCell) HashSet(java.util.HashSet)
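
Under the same assumption that processChunk wraps the matching batch call, language detection goes through BatchDetectDominantLanguage, which, unlike the other Comprehend operations on this page, takes no source language code; the class and helper names are illustrative.

import java.util.List;

import com.amazonaws.services.comprehend.AmazonComprehend;
import com.amazonaws.services.comprehend.model.BatchDetectDominantLanguageItemResult;
import com.amazonaws.services.comprehend.model.BatchDetectDominantLanguageRequest;
import com.amazonaws.services.comprehend.model.BatchDetectDominantLanguageResult;
import com.amazonaws.services.comprehend.model.DominantLanguage;

public final class LanguageSketch {

    // One batch request for the collected texts; each result item lists the
    // detected languages with confidence scores.
    static void detectLanguages(final AmazonComprehend client, final List<String> texts) {
        final BatchDetectDominantLanguageRequest request =
                new BatchDetectDominantLanguageRequest().withTextList(texts);
        final BatchDetectDominantLanguageResult result = client.batchDetectDominantLanguage(request);
        for (final BatchDetectDominantLanguageItemResult item : result.getResultList()) {
            for (final DominantLanguage language : item.getLanguages()) {
                System.out.printf("text %d: %s (%.2f)%n", item.getIndex(), language.getLanguageCode(), language.getScore());
            }
        }
    }
}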

Example 5 with Document

Use of org.knime.ext.textprocessing.data.Document in the knime-cloud project by KNIME.

The compute method of the SentimentOperation class:

@Override
public void compute(final RowInput in, final RowOutput out, final AmazonComprehend comprehendClient, final int textColIdx, final ExecutionContext exec, final long rowCount) throws CanceledExecutionException, InterruptedException {
    // Row counter for progress reporting
    long rowCounter = 0;
    final int numInputColumns = in.getDataTableSpec().getNumColumns();
    // Create row batches based on global batch size and process one batch in one request
    final List<DataRow> rows = new ArrayList<>(ComprehendUtils.BATCH_SIZE);
    final List<String> texts = new ArrayList<>(ComprehendUtils.BATCH_SIZE);
    final Set<Integer> validRows = new HashSet<>(ComprehendUtils.BATCH_SIZE);
    DataRow inputRow = null;
    while ((inputRow = in.poll()) != null) {
        // Check for cancel and update the row progress
        ++rowCounter;
        exec.checkCanceled();
        if (rowCount > 0) {
            exec.setProgress(rowCounter / (double) rowCount, "Processing row " + rowCounter + " of " + rowCount);
        }
        rows.add(inputRow);
        final DataCell cell = inputRow.getCell(textColIdx);
        if (!cell.isMissing()) {
            String textValue = null;
            if (cell.getType().isCompatible(DocumentValue.class)) {
                final Document doc = ((DocumentValue) cell).getDocument();
                textValue = doc.getTitle() + " " + doc.getDocumentBodyText();
            } else {
                textValue = cell.toString();
            }
            texts.add(textValue);
            validRows.add(rows.size() - 1);
        }
        // A full batch has accumulated: send it in a single request. processChunk
        // is presumably also responsible for clearing rows, texts, and validRows
        // before the next batch is collected.
        if (rows.size() == ComprehendUtils.BATCH_SIZE) {
            processChunk(out, comprehendClient, numInputColumns, rows, texts, validRows);
        }
    }
    // Process the final, possibly partial, chunk
    processChunk(out, comprehendClient, numInputColumns, rows, texts, validRows);
}
Also used : ArrayList(java.util.ArrayList) Document(org.knime.ext.textprocessing.data.Document) DataRow(org.knime.core.data.DataRow) DocumentValue(org.knime.ext.textprocessing.data.DocumentValue) DataCell(org.knime.core.data.DataCell) HashSet(java.util.HashSet)
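
The sentiment variant of processChunk presumably issues a BatchDetectSentiment request. A sketch of that call, with illustrative class and helper names:

import java.util.List;

import com.amazonaws.services.comprehend.AmazonComprehend;
import com.amazonaws.services.comprehend.model.BatchDetectSentimentItemResult;
import com.amazonaws.services.comprehend.model.BatchDetectSentimentRequest;
import com.amazonaws.services.comprehend.model.BatchDetectSentimentResult;
import com.amazonaws.services.comprehend.model.SentimentScore;

public final class SentimentSketch {

    static void detectSentiment(final AmazonComprehend client, final List<String> texts) {
        final BatchDetectSentimentRequest request = new BatchDetectSentimentRequest()
                .withTextList(texts)
                .withLanguageCode("en");
        final BatchDetectSentimentResult result = client.batchDetectSentiment(request);
        for (final BatchDetectSentimentItemResult item : result.getResultList()) {
            // getSentiment() is the winning label (POSITIVE, NEGATIVE, NEUTRAL
            // or MIXED); the score object carries the per-label confidences.
            final SentimentScore score = item.getSentimentScore();
            System.out.printf("text %d: %s (positive=%.2f, negative=%.2f)%n",
                    item.getIndex(), item.getSentiment(), score.getPositive(), score.getNegative());
        }
    }
}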

Aggregations

DataCell (org.knime.core.data.DataCell): 5
DataRow (org.knime.core.data.DataRow): 5
Document (org.knime.ext.textprocessing.data.Document): 5
DocumentValue (org.knime.ext.textprocessing.data.DocumentValue): 4
ArrayList (java.util.ArrayList): 3
HashSet (java.util.HashSet): 3
DefaultRow (org.knime.core.data.def.DefaultRow): 2
AmazonTranslate (com.amazonaws.services.translate.AmazonTranslate): 1
TranslateTextRequest (com.amazonaws.services.translate.model.TranslateTextRequest): 1
TranslateTextResult (com.amazonaws.services.translate.model.TranslateTextResult): 1
DataType (org.knime.core.data.DataType): 1
RowKey (org.knime.core.data.RowKey): 1
StringCell (org.knime.core.data.def.StringCell): 1
DocumentTagger (org.knime.ext.textprocessing.nodes.tagging.DocumentTagger): 1
TextContainerDataCellFactory (org.knime.ext.textprocessing.util.TextContainerDataCellFactory): 1