Use of org.knime.ext.textprocessing.data.Document in project knime-cloud by knime.
The class TranslateOperation, method compute().
void compute(final RowInput in, final RowOutput out, final ExecutionContext exec, final long rowCount) throws Exception {
    // Create a connection to the Translate service in the provided region
    final TranslateConnection conn = new TranslateConnection(m_cxnInfo);
    final AmazonTranslate translate = conn.getClient();
    int textColumnIdx = in.getDataTableSpec().findColumnIndex(m_textColumnName);
    long rowCounter = 0;
    // For each input row, grab the text column, make the call to Translate
    // and push the input plus the translation to the output.
    DataRow inputRow = null;
    while ((inputRow = in.poll()) != null) {
        // Check for cancel and update the row progress
        ++rowCounter;
        exec.checkCanceled();
        if (rowCount > 0) {
            exec.setProgress(rowCounter / (double) rowCount,
                "Processing row " + rowCounter + " of " + rowCount);
        }
        // Grab the text to evaluate
        String textValue = null;
        final DataCell cell = inputRow.getCell(textColumnIdx);
        // Create cells containing the output data.
        // Copy the input data to the output
        final int numInputColumns = inputRow.getNumCells();
        DataCell[] cells = Stream.generate(DataType::getMissingCell)
            .limit(numInputColumns + 1)
            .toArray(DataCell[]::new);
        for (int i = 0; i < numInputColumns; i++) {
            cells[i] = inputRow.getCell(i);
        }
        if (!cell.isMissing()) {
            if (cell.getType().isCompatible(DocumentValue.class)) {
                final Document doc = ((DocumentValue) cell).getDocument();
                textValue = doc.getTitle() + " " + doc.getDocumentBodyText();
            } else {
                textValue = cell.toString();
            }
            final TranslateTextRequest request = new TranslateTextRequest()
                .withText(textValue)
                .withSourceLanguageCode(m_sourceLangCode)
                .withTargetLanguageCode(m_targetLangCode);
            final TranslateTextResult result = translate.translateText(request);
            cells[numInputColumns] = new StringCell(result.getTranslatedText());
        }
        // Create a new data row and push it to the output container.
        out.push(new DefaultRow(inputRow.getKey(), cells));
    }
}
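The row loop above delegates the actual translation to the AWS SDK for Java (v1) Amazon Translate client obtained from the node's TranslateConnection. For orientation, here is a minimal standalone sketch of the same request/response cycle, assuming the client is built directly with AmazonTranslateClientBuilder; the region and language codes are placeholders, not values taken from the node.

// Minimal sketch (assumption): calling Amazon Translate directly with the AWS SDK for Java v1,
// outside of the KNIME row loop shown above.
import com.amazonaws.services.translate.AmazonTranslate;
import com.amazonaws.services.translate.AmazonTranslateClientBuilder;
import com.amazonaws.services.translate.model.TranslateTextRequest;
import com.amazonaws.services.translate.model.TranslateTextResult;

public final class TranslateSketch {
    public static void main(final String[] args) {
        // Credentials come from the default provider chain in this sketch;
        // the node instead obtains a configured client from its TranslateConnection.
        final AmazonTranslate translate = AmazonTranslateClientBuilder.standard()
            .withRegion("us-east-1")
            .build();
        final TranslateTextRequest request = new TranslateTextRequest()
            .withText("How are you?")
            .withSourceLanguageCode("en")
            .withTargetLanguageCode("de");
        final TranslateTextResult result = translate.translateText(request);
        System.out.println(result.getTranslatedText());
    }
}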
Use of org.knime.ext.textprocessing.data.Document in project knime-cloud by knime.
The class ComprehendTaggerOperation, method compute().
@Override
public final void compute(final RowInput in, final RowOutput out, final AmazonComprehend comprehendClient,
        final int textColIdx, final ExecutionContext exec, final long rowCount)
        throws CanceledExecutionException, InterruptedException {
    // Create the tagger that uses the detect entities capability of the Comprehend
    // service.
    final DocumentTagger tagger = getTagger(comprehendClient,
        ComprehendUtils.LANG_MAP.getOrDefault(m_sourceLanguage, "en"), m_tokenizerName);
    final TextContainerDataCellFactory docCellFactory =
        TextContainerDataCellFactoryBuilder.createDocumentCellFactory();
    docCellFactory.prepare(FileStoreFactory.createFileStoreFactory(exec));
    long inputRowIndex = 0;
    long rowCounter = 0;
    // Tag each input document
    DataRow inputRow = null;
    while ((inputRow = in.poll()) != null) {
        // Check for cancel and update the row progress
        ++rowCounter;
        exec.checkCanceled();
        if (rowCount > 0) {
            exec.setProgress(rowCounter / (double) rowCount,
                "Processing row " + rowCounter + " of " + rowCount);
        }
        // Grab the text to evaluate
        final DataCell cell = inputRow.getCell(textColIdx);
        final DataCell newDataCell;
        if (!cell.isMissing()) {
            final Document outputDoc = tagger.tag(((DocumentValue) cell).getDocument());
            newDataCell = docCellFactory.createDataCell(outputDoc);
        } else {
            newDataCell = cell;
        }
        // Create cells containing the output data.
        // Copy the input data to the output
        final int numInputColumns = inputRow.getNumCells();
        final DataCell[] cells = m_newColName != null
            ? new DataCell[numInputColumns + 1]
            : new DataCell[numInputColumns];
        for (int i = 0; i < numInputColumns; i++) {
            cells[i] = inputRow.getCell(i);
        }
        // Copy the output document tagged with entities to the output
        cells[m_newColName != null ? numInputColumns : textColIdx] = newDataCell;
        // Create a new data row and push it to the output container.
        final RowKey key = new RowKey("Row " + inputRowIndex);
        final DataRow row = new DefaultRow(key, cells);
        out.push(row);
        ++inputRowIndex;
    }
}
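The getTagger(...) helper is not part of this excerpt. An entity tagger backed by Comprehend would typically call the detectEntities API on each document's text and map the returned character offsets onto terms; the sketch below only illustrates that underlying call with the AWS SDK for Java v1. The class, method name, and offset handling are assumptions, not the actual knime-cloud implementation.

// Hypothetical sketch: the raw Comprehend call an entity tagger could wrap.
import com.amazonaws.services.comprehend.AmazonComprehend;
import com.amazonaws.services.comprehend.model.DetectEntitiesRequest;
import com.amazonaws.services.comprehend.model.DetectEntitiesResult;
import com.amazonaws.services.comprehend.model.Entity;

final class EntityDetectionSketch {
    static void detect(final AmazonComprehend comprehendClient, final String text, final String languageCode) {
        final DetectEntitiesRequest request = new DetectEntitiesRequest()
            .withText(text)
            .withLanguageCode(languageCode);
        final DetectEntitiesResult result = comprehendClient.detectEntities(request);
        for (final Entity entity : result.getEntities()) {
            // Each entity carries its surface form, type (PERSON, LOCATION, ...) and character offsets;
            // a real tagger would map these offsets back onto the document's terms.
            System.out.println(entity.getType() + ": " + entity.getText()
                + " [" + entity.getBeginOffset() + ", " + entity.getEndOffset() + ")");
        }
    }
}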
Use of org.knime.ext.textprocessing.data.Document in project knime-cloud by knime.
The class KeyPhrasesOperation, method compute().
@Override
public void compute(final RowInput in, final RowOutput out, final AmazonComprehend comprehendClient,
        final int textColIdx, final ExecutionContext exec, final long rowCount)
        throws CanceledExecutionException, InterruptedException {
    // Row index
    long rowCounter = 0;
    final int numInputColumns = in.getDataTableSpec().getNumColumns();
    // Create row batches based on global batch size and process one batch in one request
    final List<DataRow> rowBatch = new ArrayList<>(ComprehendUtils.BATCH_SIZE);
    final List<String> texts = new ArrayList<>(ComprehendUtils.BATCH_SIZE);
    final Set<Integer> validRows = new HashSet<>(ComprehendUtils.BATCH_SIZE);
    DataRow inputRow = null;
    while ((inputRow = in.poll()) != null) {
        // Check for cancel and update the row progress
        ++rowCounter;
        exec.checkCanceled();
        if (rowCount > 0) {
            exec.setProgress(rowCounter / (double) rowCount,
                "Processing row " + rowCounter + " of " + rowCount);
        }
        rowBatch.add(inputRow);
        final DataCell cell = inputRow.getCell(textColIdx);
        if (!cell.isMissing()) {
            String textValue = null;
            if (cell.getType().isCompatible(DocumentValue.class)) {
                final Document doc = ((DocumentValue) cell).getDocument();
                textValue = doc.getTitle() + " " + doc.getDocumentBodyText();
            } else {
                textValue = cell.toString();
            }
            texts.add(textValue);
            validRows.add(rowBatch.size() - 1);
        }
        if (rowBatch.size() == ComprehendUtils.BATCH_SIZE) {
            processChunk(out, comprehendClient, numInputColumns, rowBatch, texts, validRows);
        }
    }
    // process remaining chunk
    processChunk(out, comprehendClient, numInputColumns, rowBatch, texts, validRows);
}
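processChunk(...) is not shown in this excerpt; presumably it submits the collected texts as one request, writes the output rows, and resets rowBatch, texts, and validRows for the next batch, since the caller never clears them. A plausible shape for the key-phrase batch call, using Comprehend's batch API from the AWS SDK for Java v1, is sketched below; the method name, signature, and result handling are assumptions meant only to show how each result item maps back to its position in the submitted text list.

// Hypothetical sketch of a key-phrase batch request; not the actual processChunk implementation.
import java.util.List;
import com.amazonaws.services.comprehend.AmazonComprehend;
import com.amazonaws.services.comprehend.model.BatchDetectKeyPhrasesItemResult;
import com.amazonaws.services.comprehend.model.BatchDetectKeyPhrasesRequest;
import com.amazonaws.services.comprehend.model.BatchDetectKeyPhrasesResult;
import com.amazonaws.services.comprehend.model.KeyPhrase;

final class KeyPhraseBatchSketch {
    static void detectBatch(final AmazonComprehend comprehendClient, final List<String> texts, final String languageCode) {
        final BatchDetectKeyPhrasesRequest request = new BatchDetectKeyPhrasesRequest()
            .withTextList(texts)
            .withLanguageCode(languageCode);
        final BatchDetectKeyPhrasesResult result = comprehendClient.batchDetectKeyPhrases(request);
        for (final BatchDetectKeyPhrasesItemResult item : result.getResultList()) {
            // item.getIndex() refers to the position in the submitted text list,
            // which is how results can be matched back to the rows that contributed a text.
            for (final KeyPhrase phrase : item.getKeyPhrases()) {
                System.out.println(item.getIndex() + ": " + phrase.getText() + " (" + phrase.getScore() + ")");
            }
        }
    }
}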
Use of org.knime.ext.textprocessing.data.Document in project knime-cloud by knime.
The class LanguageOperation, method compute().
@Override
public void compute(final RowInput in, final RowOutput out, final AmazonComprehend comprehendClient,
        final int textColIdx, final ExecutionContext exec, final long rowCount)
        throws CanceledExecutionException, InterruptedException {
    long rowCounter = 0;
    final int numInputColumns = in.getDataTableSpec().getNumColumns();
    // Create row batches based on global batch size and process one batch in one request
    final List<DataRow> rowBatch = new ArrayList<>(ComprehendUtils.BATCH_SIZE);
    final List<String> texts = new ArrayList<>(ComprehendUtils.BATCH_SIZE);
    final Set<Integer> validRows = new HashSet<>(ComprehendUtils.BATCH_SIZE);
    DataRow inputRow = null;
    while ((inputRow = in.poll()) != null) {
        // Check for cancel and update the row progress
        ++rowCounter;
        exec.checkCanceled();
        if (rowCount > 0) {
            exec.setProgress(rowCounter / (double) rowCount,
                "Processing row " + rowCounter + " of " + rowCount);
        }
        rowBatch.add(inputRow);
        final DataCell cell = inputRow.getCell(textColIdx);
        if (!cell.isMissing()) {
            String textValue = null;
            if (cell.getType().isCompatible(DocumentValue.class)) {
                final Document doc = ((DocumentValue) cell).getDocument();
                textValue = doc.getTitle() + " " + doc.getDocumentBodyText();
            } else {
                textValue = cell.toString();
            }
            texts.add(textValue);
            validRows.add(rowBatch.size() - 1);
        }
        if (rowBatch.size() == ComprehendUtils.BATCH_SIZE) {
            processChunk(out, comprehendClient, numInputColumns, rowBatch, texts, validRows);
        }
    }
    // process remaining chunk
    processChunk(out, comprehendClient, numInputColumns, rowBatch, texts, validRows);
}
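For language detection, the batch counterpart in Comprehend is batchDetectDominantLanguage, which, unlike the other batch operations, takes no language code. The following is again a hedged sketch of what the unshown processChunk might wrap, not the project's implementation.

// Hypothetical sketch of a dominant-language batch request.
import java.util.List;
import com.amazonaws.services.comprehend.AmazonComprehend;
import com.amazonaws.services.comprehend.model.BatchDetectDominantLanguageItemResult;
import com.amazonaws.services.comprehend.model.BatchDetectDominantLanguageRequest;
import com.amazonaws.services.comprehend.model.BatchDetectDominantLanguageResult;
import com.amazonaws.services.comprehend.model.DominantLanguage;

final class LanguageBatchSketch {
    static void detectBatch(final AmazonComprehend comprehendClient, final List<String> texts) {
        final BatchDetectDominantLanguageRequest request =
            new BatchDetectDominantLanguageRequest().withTextList(texts);
        final BatchDetectDominantLanguageResult result = comprehendClient.batchDetectDominantLanguage(request);
        for (final BatchDetectDominantLanguageItemResult item : result.getResultList()) {
            // Each submitted text yields one or more language codes with confidence scores.
            for (final DominantLanguage language : item.getLanguages()) {
                System.out.println(item.getIndex() + ": " + language.getLanguageCode()
                    + " (" + language.getScore() + ")");
            }
        }
    }
}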
Use of org.knime.ext.textprocessing.data.Document in project knime-cloud by knime.
The class SentimentOperation, method compute().
@Override
public void compute(final RowInput in, final RowOutput out, final AmazonComprehend comprehendClient,
        final int textColIdx, final ExecutionContext exec, final long rowCount)
        throws CanceledExecutionException, InterruptedException {
    // Row index
    long rowCounter = 0;
    final int numInputColumns = in.getDataTableSpec().getNumColumns();
    // Create row batches based on global batch size and process one batch in one request
    final List<DataRow> rows = new ArrayList<>(ComprehendUtils.BATCH_SIZE);
    final List<String> texts = new ArrayList<>(ComprehendUtils.BATCH_SIZE);
    final Set<Integer> validRows = new HashSet<>(ComprehendUtils.BATCH_SIZE);
    DataRow inputRow = null;
    while ((inputRow = in.poll()) != null) {
        // Check for cancel and update the row progress
        ++rowCounter;
        exec.checkCanceled();
        if (rowCount > 0) {
            exec.setProgress(rowCounter / (double) rowCount,
                "Processing row " + rowCounter + " of " + rowCount);
        }
        rows.add(inputRow);
        final DataCell cell = inputRow.getCell(textColIdx);
        if (!cell.isMissing()) {
            String textValue = null;
            if (cell.getType().isCompatible(DocumentValue.class)) {
                final Document doc = ((DocumentValue) cell).getDocument();
                textValue = doc.getTitle() + " " + doc.getDocumentBodyText();
            } else {
                textValue = cell.toString();
            }
            texts.add(textValue);
            validRows.add(rows.size() - 1);
        }
        if (rows.size() == ComprehendUtils.BATCH_SIZE) {
            processChunk(out, comprehendClient, numInputColumns, rows, texts, validRows);
        }
    }
    // process remaining chunk
    processChunk(out, comprehendClient, numInputColumns, rows, texts, validRows);
}
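For sentiment, the batch counterpart is batchDetectSentiment; each item result carries an overall sentiment label plus per-class scores. As above, this is an illustrative sketch under the assumption that the unshown processChunk wraps a call of this form.

// Hypothetical sketch of a sentiment batch request.
import java.util.List;
import com.amazonaws.services.comprehend.AmazonComprehend;
import com.amazonaws.services.comprehend.model.BatchDetectSentimentItemResult;
import com.amazonaws.services.comprehend.model.BatchDetectSentimentRequest;
import com.amazonaws.services.comprehend.model.BatchDetectSentimentResult;

final class SentimentBatchSketch {
    static void detectBatch(final AmazonComprehend comprehendClient, final List<String> texts, final String languageCode) {
        final BatchDetectSentimentRequest request = new BatchDetectSentimentRequest()
            .withTextList(texts)
            .withLanguageCode(languageCode);
        final BatchDetectSentimentResult result = comprehendClient.batchDetectSentiment(request);
        for (final BatchDetectSentimentItemResult item : result.getResultList()) {
            // Overall label (POSITIVE, NEGATIVE, NEUTRAL, MIXED) plus per-class confidence scores.
            System.out.println(item.getIndex() + ": " + item.getSentiment()
                + " positive=" + item.getSentimentScore().getPositive()
                + " negative=" + item.getSentimentScore().getNegative());
        }
    }
}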