Search in sources :

Example 1 with KettleConversionException

use of org.pentaho.di.core.exception.KettleConversionException in project pentaho-kettle by pentaho.

In the class CsvInput, the method processRow:

/**
 * Reads one row from the current CSV file and passes it on to the next step(s).
 *
 * @param smi the step meta (a CsvInputMeta)
 * @param sdi the step data (a CsvInputData)
 * @return true while there is (potentially) more input, false when this step is done
 * @throws KettleException when a conversion error occurs and error handling is disabled
 */
public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException {
    meta = (CsvInputMeta) smi;
    data = (CsvInputData) sdi;

    if (first) {
        first = false;

        // Determine the output layout of this step.
        data.outputRowMeta = new RowMeta();
        meta.getFields(data.outputRowMeta, getStepname(), null, null, this, repository, metaStore);

        // Without a static file list, the filenames come from the previous step(s)...
        if (data.filenames == null) {
            getFilenamesFromPreviousSteps();
        }

        // Parallel reading requires at least one file AND more than one running step copy.
        data.parallel = meta.isRunningInParallel() && data.totalNumberOfSteps > 1;

        // Even with lazy conversion turned off we pretend the values are lazily converted:
        // the conversion row meta uses binary-string storage and the native type is
        // extracted during conversion.
        data.convertRowMeta = data.outputRowMeta.clone();
        for (ValueMetaInterface conversionMeta : data.convertRowMeta.getValueMetaList()) {
            conversionMeta.setStorageType(ValueMetaInterface.STORAGE_TYPE_BINARY_STRING);
        }

        // Pre-calculate where the optional filename and row number fields end up.
        data.filenameFieldIndex = -1;
        if (!Utils.isEmpty(meta.getFilenameField()) && meta.isIncludingFilename()) {
            data.filenameFieldIndex = meta.getInputFields().length;
        }
        data.rownumFieldIndex = -1;
        if (!Utils.isEmpty(meta.getRowNumField())) {
            data.rownumFieldIndex = meta.getInputFields().length;
            if (data.filenameFieldIndex >= 0) {
                data.rownumFieldIndex++;
            }
        }

        if (data.parallel) {
            prepareToRunInParallel();
        }

        if (!openNextFile()) {
            // nothing to see here, move along...
            setOutputDone();
            return false;
        }
    }

    // In parallel mode each copy only reads its assigned block of bytes.
    if (data.parallel && data.totalBytesRead >= data.blockToRead) {
        setOutputDone();
        return false;
    }

    try {
        // get row, set busy!
        Object[] row = readOneRow(false, false);

        if (row == null) {
            // End of the current file: try the next one, or finish up.
            if (openNextFile()) {
                return true;
            }
            setOutputDone();
            return false;
        }

        // Copy the row to possible alternate rowset(s).
        putRow(data.outputRowMeta, row);
        if (checkFeedback(getLinesInput()) && log.isBasic()) {
            logBasic(BaseMessages.getString(PKG, "CsvInput.Log.LineNumber", Long.toString(getLinesInput())));
        }
    } catch (KettleConversionException e) {
        if (!getStepMeta().isDoingErrorHandling()) {
            throw new KettleException(e.getMessage(), e.getCauses().get(0));
        }
        // Error handling enabled: collect all conversion errors of this row into one error row.
        List<Exception> causes = e.getCauses();
        StringBuilder descriptions = new StringBuilder(100);
        StringBuilder fieldNames = new StringBuilder(50);
        for (int i = 0; i < causes.size(); i++) {
            if (i > 0) {
                descriptions.append(", ");
                fieldNames.append(", ");
            }
            descriptions.append(causes.get(i).getMessage());
            fieldNames.append(e.getFields().get(i).toStringMeta());
        }
        putError(data.outputRowMeta, e.getRowData(), causes.size(), descriptions.toString(), fieldNames.toString(), "CSVINPUT001");
    }
    return true;
}
Also used : KettleException(org.pentaho.di.core.exception.KettleException) RowMeta(org.pentaho.di.core.row.RowMeta) KettleConversionException(org.pentaho.di.core.exception.KettleConversionException) FileObject(org.apache.commons.vfs2.FileObject) ValueMetaInterface(org.pentaho.di.core.row.ValueMetaInterface)

Example 2 with KettleConversionException

use of org.pentaho.di.core.exception.KettleConversionException in project pentaho-kettle by pentaho.

In the class CsvInput, the method readOneRow:

/**
 * Read a single row of data from the file...
 *
 * @param skipRow          if row should be skipped: header row or part of row in case of parallel read
 * @param ignoreEnclosures if enclosures should be ignored, i.e. in case of we need to skip part of the row during
 *                         parallel read
 * @return a row of data, or null when there is nothing left to read
 * @throws KettleException a KettleConversionException when field conversion fails, or a KettleFileException on I/O errors
 */
private Object[] readOneRow(boolean skipRow, boolean ignoreEnclosures) throws KettleException {
    try {
        // One slot per output field; fields that stay unresolved remain null.
        Object[] outputRowData = RowDataUtil.allocateRowData(data.outputRowMeta.size());
        int outputIndex = 0;
        boolean newLineFound = false;
        boolean endOfBuffer = false;
        // Conversion errors are collected rather than thrown immediately, so that
        // all faulty fields of one row can be reported together at the end.
        List<Exception> conversionExceptions = null;
        List<ValueMetaInterface> exceptionFields = null;
        // Scan the buffer field by field until we hit the end of the line (or ran out of mapped fields).
        // 
        while (!newLineFound && outputIndex < data.fieldsMapping.size()) {
            if (data.resizeBufferIfNeeded()) {
                // there is no end of line delimiter
                // NOTE(review): outputRowData is allocated above and is never null here — this guard looks redundant.
                if (outputRowData != null) {
                    // filling the rest of them with null
                    if (outputIndex > 0) {
                        // At least one field was read before the data ran out: emit this
                        // partial row, adding the optional filename / row number fields.
                        // 
                        if (meta.isIncludingFilename() && !Utils.isEmpty(meta.getFilenameField())) {
                            if (meta.isLazyConversionActive()) {
                                outputRowData[data.filenameFieldIndex] = data.binaryFilename;
                            } else {
                                outputRowData[data.filenameFieldIndex] = data.filenames[data.filenr - 1];
                            }
                        }
                        if (data.isAddingRowNumber) {
                            outputRowData[data.rownumFieldIndex] = data.rowNumber++;
                        }
                        incrementLinesInput();
                        return outputRowData;
                    }
                }
                // nothing more to read, call it a day.
                return null;
            }
            // OK, at this point we should have data in the byteBuffer and we should be able to scan for the next
            // delimiter (;)
            // So let's look for a delimiter.
            // Also skip over the enclosures ("), it is NOT taking into account escaped enclosures.
            // Later we can add an option for having escaped or double enclosures in the file. <sigh>
            // 
            boolean delimiterFound = false;
            boolean enclosureFound = false;
            boolean doubleLineEnd = false;
            int escapedEnclosureFound = 0;
            boolean ignoreEnclosuresInField = ignoreEnclosures;
            // Walk the buffer byte by byte until a delimiter, a new line or the end of the buffer.
            while (!delimiterFound && !newLineFound && !endOfBuffer) {
                // 
                if (data.delimiterFound()) {
                    delimiterFound = true;
                } else if ((!meta.isNewlinePossibleInFields() || outputIndex == data.fieldsMapping.size() - 1) && data.newLineFound()) {
                    // Perhaps we found a (pre-mature) new line?
                    // 
                    // In case we are not using an enclosure and in case fields contain new lines
                    // we need to make sure that we check the newlines possible flag.
                    // If the flag is enable we skip newline checking except for the last field in the row.
                    // In that one we can't support newlines without enclosure (handled below).
                    // 
                    newLineFound = true;
                    // Skip new line character
                    for (int i = 0; i < data.encodingType.getLength(); i++) {
                        data.moveEndBufferPointer();
                    }
                    // Re-check for double new line (\r\n)...
                    if (data.newLineFound()) {
                        // Found another one, need to skip it later
                        doubleLineEnd = true;
                    }
                } else if (data.enclosureFound() && !ignoreEnclosuresInField) {
                    int enclosurePosition = data.getEndBuffer();
                    int fieldFirstBytePosition = data.getStartBuffer();
                    // An enclosure only starts an enclosed field when it is the very first byte of the field.
                    if (fieldFirstBytePosition == enclosurePosition) {
                        // Perhaps we need to skip over an enclosed part?
                        // We always expect exactly one enclosure character
                        // If we find the enclosure doubled, we consider it escaped.
                        // --> "" is converted to " later on.
                        // 
                        enclosureFound = true;
                        boolean keepGoing;
                        do {
                            if (data.moveEndBufferPointer()) {
                                enclosureFound = false;
                                break;
                            }
                            keepGoing = !data.enclosureFound();
                            if (!keepGoing) {
                                // Read another byte...
                                if (!data.endOfBuffer() && data.moveEndBufferPointer()) {
                                    break;
                                }
                                if (data.enclosure.length > 1) {
                                    data.moveEndBufferPointer();
                                }
                                // If this character is also an enclosure, we can consider the enclosure "escaped".
                                // As such, if this is an enclosure, we keep going...
                                // 
                                keepGoing = data.enclosureFound();
                                if (keepGoing) {
                                    escapedEnclosureFound++;
                                }
                            }
                        } while (keepGoing);
                        // 
                        if (data.endOfBuffer()) {
                            endOfBuffer = true;
                            break;
                        }
                    } else {
                        // Ignoring enclosure if it's not at the field start
                        ignoreEnclosuresInField = true;
                    }
                } else {
                    if (data.moveEndBufferPointer()) {
                        endOfBuffer = true;
                        break;
                    }
                }
            }
            // If we're still here, we found a delimiter...
            // Since the starting point never changed really, we just can grab range:
            // 
            // [startBuffer-endBuffer[
            // 
            // This is the part we want.
            // data.byteBuffer[data.startBuffer]
            // 
            byte[] field = data.getField(delimiterFound, enclosureFound, newLineFound, endOfBuffer);
            // Collapse doubled enclosures ("" -> ") found while scanning the field.
            // 
            if (escapedEnclosureFound > 0) {
                if (log.isRowLevel()) {
                    logRowlevel("Escaped enclosures found in " + new String(field));
                }
                field = data.removeEscapedEnclosures(field, escapedEnclosureFound);
            }
            final int currentFieldIndex = outputIndex++;
            final int actualFieldIndex = data.fieldsMapping.fieldMetaIndex(currentFieldIndex);
            if (actualFieldIndex != FieldsMapping.FIELD_DOES_NOT_EXIST) {
                if (!skipRow) {
                    if (meta.isLazyConversionActive()) {
                        outputRowData[actualFieldIndex] = field;
                    } else {
                        // We're not lazy so we convert the data right here and now.
                        // The convert object uses binary storage as such we just have to ask the native type from it.
                        // That will do the actual conversion.
                        // 
                        ValueMetaInterface sourceValueMeta = data.convertRowMeta.getValueMeta(actualFieldIndex);
                        try {
                            outputRowData[actualFieldIndex] = sourceValueMeta.convertBinaryStringToNativeType(field);
                        } catch (KettleValueException e) {
                            // There was a conversion error,
                            // null the field and remember the error; the row is still produced.
                            // 
                            outputRowData[actualFieldIndex] = null;
                            if (conversionExceptions == null) {
                                conversionExceptions = new ArrayList<Exception>();
                                exceptionFields = new ArrayList<ValueMetaInterface>();
                            }
                            conversionExceptions.add(e);
                            exceptionFields.add(sourceValueMeta);
                        }
                    }
                } else {
                    // nothing for the header, no conversions here.
                    outputRowData[actualFieldIndex] = null;
                }
            }
            // empty column at the end of the row (see the Jira case for details)
            if ((!newLineFound && outputIndex < data.fieldsMapping.size()) || (newLineFound && doubleLineEnd)) {
                int i = 0;
                while ((!data.newLineFound() && (i < data.delimiter.length))) {
                    data.moveEndBufferPointer();
                    i++;
                }
                if (data.newLineFound() && outputIndex >= data.fieldsMapping.size()) {
                    data.moveEndBufferPointer();
                }
                if (doubleLineEnd && data.encodingType.getLength() > 1) {
                    data.moveEndBufferPointer();
                }
            }
            data.setStartBuffer(data.getEndBuffer());
        }
        // All mapped fields were read but no newline seen yet: skip the rest of the line.
        // 
        if (!newLineFound && !data.resizeBufferIfNeeded()) {
            do {
                data.moveEndBufferPointer();
                if (data.resizeBufferIfNeeded()) {
                    // nothing more to read.
                    break;
                }
            // TODO: if we're using quoting we might be dealing with a very dirty file with quoted newlines in trailing
            // fields. (imagine that)
            // In that particular case we want to use the same logic we use above (refactored a bit) to skip these fields.
            } while (!data.newLineFound());
            if (!data.resizeBufferIfNeeded()) {
                while (data.newLineFound()) {
                    data.moveEndBufferPointer();
                    if (data.resizeBufferIfNeeded()) {
                        // nothing more to read.
                        break;
                    }
                }
            }
            // Make sure we start at the right position the next time around.
            data.setStartBuffer(data.getEndBuffer());
        }
        // Add the optional filename and row number fields.
        // 
        if (meta.isIncludingFilename() && !Utils.isEmpty(meta.getFilenameField())) {
            if (meta.isLazyConversionActive()) {
                outputRowData[data.filenameFieldIndex] = data.binaryFilename;
            } else {
                outputRowData[data.filenameFieldIndex] = data.filenames[data.filenr - 1];
            }
        }
        if (data.isAddingRowNumber) {
            outputRowData[data.rownumFieldIndex] = data.rowNumber++;
        }
        // Only count the line for real reads; ignoreEnclosures is set while skipping
        // partial rows during parallel reads (see the javadoc).
        if (!ignoreEnclosures) {
            incrementLinesInput();
        }
        // Report all conversion errors of this row in one exception, carrying the row data.
        if (conversionExceptions != null && conversionExceptions.size() > 0) {
            // 
            throw new KettleConversionException("There were " + conversionExceptions.size() + " conversion errors on line " + getLinesInput(), conversionExceptions, exceptionFields, outputRowData);
        }
        return outputRowData;
    } catch (KettleConversionException e) {
        throw e;
    } catch (IOException e) {
        throw new KettleFileException("Exception reading line using NIO", e);
    }
}
Also used : KettleFileException(org.pentaho.di.core.exception.KettleFileException) ArrayList(java.util.ArrayList) IOException(java.io.IOException) KettleException(org.pentaho.di.core.exception.KettleException) KettleFileException(org.pentaho.di.core.exception.KettleFileException) IOException(java.io.IOException) KettleValueException(org.pentaho.di.core.exception.KettleValueException) KettleConversionException(org.pentaho.di.core.exception.KettleConversionException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) ValueMetaInterface(org.pentaho.di.core.row.ValueMetaInterface) KettleConversionException(org.pentaho.di.core.exception.KettleConversionException) FileObject(org.apache.commons.vfs2.FileObject) KettleValueException(org.pentaho.di.core.exception.KettleValueException)

Example 3 with KettleConversionException

use of org.pentaho.di.core.exception.KettleConversionException in project pentaho-kettle by pentaho.

In the class SelectValues, the method metadataValues:

/**
 * Changes the meta-data (type, conversion mask, locale, symbols, ...) of the
 * selected fields of a row and converts the data accordingly.
 * <p/>
 * Field indexes are resolved and the value meta-data configured once, on the first
 * row; subsequent rows only run the (fast) per-field conversion.
 *
 * @param rowMeta the meta-data describing the incoming row
 * @param rowData the row to manipulate in place
 * @return the altered row, or null when a selected field is missing or selected
 *         more than once (the step is stopped in that case)
 * @throws KettleException a KettleConversionException when a value conversion fails
 */
@VisibleForTesting
synchronized Object[] metadataValues(RowMetaInterface rowMeta, Object[] rowData) throws KettleException {
    if (data.firstmetadata) {
        data.firstmetadata = false;

        // Resolve and cache the row index of every selected field.
        data.metanrs = new int[meta.getMeta().length];
        for (int idx = 0; idx < data.metanrs.length; idx++) {
            data.metanrs[idx] = rowMeta.indexOfValue(meta.getMeta()[idx].getName());
            if (data.metanrs[idx] < 0) {
                logError(BaseMessages.getString(PKG, "SelectValues.Log.CouldNotFindField", meta.getMeta()[idx].getName()));
                setErrors(1);
                stopAll();
                return null;
            }
        }

        // A field may only be selected once: abort as soon as a duplicate name shows up.
        for (int idx = 0; idx < meta.getMeta().length; idx++) {
            int occurrences = 0;
            for (int other = 0; other < meta.getMeta().length; other++) {
                if (meta.getMeta()[idx].getName().equals(meta.getMeta()[other].getName())) {
                    occurrences++;
                }
                if (occurrences > 1) {
                    logError(BaseMessages.getString(PKG, "SelectValues.Log.FieldCouldNotSpecifiedMoreThanTwice2", meta.getMeta()[idx].getName()));
                    setErrors(1);
                    stopAll();
                    return null;
                }
            }
        }

        // Copy the requested formatting options onto the cached value meta-data.
        for (int idx = 0; idx < data.metanrs.length; idx++) {
            SelectMetadataChange change = meta.getMeta()[idx];
            ValueMetaInterface valueMeta = rowMeta.getValueMeta(data.metanrs[idx]);
            if (!Utils.isEmpty(change.getConversionMask())) {
                valueMeta.setConversionMask(change.getConversionMask());
            }
            valueMeta.setDateFormatLenient(change.isDateFormatLenient());
            valueMeta.setDateFormatLocale(EnvUtil.createLocale(change.getDateFormatLocale()));
            valueMeta.setDateFormatTimeZone(EnvUtil.createTimeZone(change.getDateFormatTimeZone()));
            valueMeta.setLenientStringToNumber(change.isLenientStringToNumber());
            if (!Utils.isEmpty(change.getEncoding())) {
                valueMeta.setStringEncoding(change.getEncoding());
            }
            if (!Utils.isEmpty(change.getDecimalSymbol())) {
                valueMeta.setDecimalSymbol(change.getDecimalSymbol());
            }
            if (!Utils.isEmpty(change.getGroupingSymbol())) {
                valueMeta.setGroupingSymbol(change.getGroupingSymbol());
            }
            if (!Utils.isEmpty(change.getCurrencySymbol())) {
                valueMeta.setCurrencySymbol(change.getCurrencySymbol());
            }
        }
    }

    // Convert the data of every selected field to its target type.
    for (int idx = 0; idx < data.metanrs.length; idx++) {
        int fieldIndex = data.metanrs[idx];
        ValueMetaInterface sourceMeta = rowMeta.getValueMeta(fieldIndex);
        ValueMetaInterface targetMeta = data.metadataRowMeta.getValueMeta(fieldIndex);
        try {
            // Unwrap lazy (binary string) storage first when normal storage is requested.
            if (sourceMeta.isStorageBinaryString() && meta.getMeta()[idx].getStorageType() == ValueMetaInterface.STORAGE_TYPE_NORMAL) {
                rowData[fieldIndex] = sourceMeta.convertBinaryStringToNativeType((byte[]) rowData[fieldIndex]);
            }
            if (meta.getMeta()[idx].getType() != ValueMetaInterface.TYPE_NONE && sourceMeta.getType() != targetMeta.getType()) {
                rowData[fieldIndex] = targetMeta.convertData(sourceMeta, rowData[fieldIndex]);
            }
        } catch (KettleValueException e) {
            // Wrap the failure so callers get the field meta and the (partial) row as well.
            throw new KettleConversionException(e.getMessage(), Collections.<Exception>singletonList(e), Collections.singletonList(targetMeta), rowData);
        }
    }
    return rowData;
}
Also used : KettleConversionException(org.pentaho.di.core.exception.KettleConversionException) KettleValueException(org.pentaho.di.core.exception.KettleValueException) KettleException(org.pentaho.di.core.exception.KettleException) KettleValueException(org.pentaho.di.core.exception.KettleValueException) KettleConversionException(org.pentaho.di.core.exception.KettleConversionException) ValueMetaInterface(org.pentaho.di.core.row.ValueMetaInterface) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 4 with KettleConversionException

use of org.pentaho.di.core.exception.KettleConversionException in project pentaho-kettle by pentaho.

In the class SelectValuesTest, the method errorRowSetObtainsFieldName:

@Test
public void errorRowSetObtainsFieldName() throws Exception {
    // One selected field with a meta-data change that converts it to Integer.
    SelectValuesMeta stepMeta = new SelectValuesMeta();
    stepMeta.allocate(1, 0, 1);
    stepMeta.getSelectFields()[0] = new SelectField();
    stepMeta.getSelectFields()[0].setName(SELECTED_FIELD);
    stepMeta.getMeta()[0] = new SelectMetadataChange(stepMeta, SELECTED_FIELD, null, ValueMetaInterface.TYPE_INTEGER, -2, -2, ValueMetaInterface.STORAGE_TYPE_NORMAL, null, false, null, null, false, null, null, null);

    SelectValuesData stepData = new SelectValuesData();
    stepData.select = true;
    stepData.metadata = true;
    stepData.firstselect = true;
    stepData.firstmetadata = true;

    // The failing conversion must route an error row that carries the field name.
    step.processRow(stepMeta, stepData);
    verify(step).putError(any(RowMetaInterface.class), any(Object[].class), anyLong(), anyString(), eq(SELECTED_FIELD), anyString());

    // Additionally, the conversion error must surface as a KettleConversionException.
    boolean caughtConversionException = false;
    try {
        step.metadataValues(step.getInputRowMeta(), inputRow);
    } catch (KettleConversionException e) {
        caughtConversionException = true;
    }
    assertTrue(caughtConversionException);
}
Also used : SelectField(org.pentaho.di.trans.steps.selectvalues.SelectValuesMeta.SelectField) KettleConversionException(org.pentaho.di.core.exception.KettleConversionException) RowMetaInterface(org.pentaho.di.core.row.RowMetaInterface) Test(org.junit.Test)

Aggregations

KettleConversionException (org.pentaho.di.core.exception.KettleConversionException)4 KettleException (org.pentaho.di.core.exception.KettleException)3 ValueMetaInterface (org.pentaho.di.core.row.ValueMetaInterface)3 FileObject (org.apache.commons.vfs2.FileObject)2 KettleValueException (org.pentaho.di.core.exception.KettleValueException)2 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 IOException (java.io.IOException)1 UnsupportedEncodingException (java.io.UnsupportedEncodingException)1 ArrayList (java.util.ArrayList)1 Test (org.junit.Test)1 KettleFileException (org.pentaho.di.core.exception.KettleFileException)1 RowMeta (org.pentaho.di.core.row.RowMeta)1 RowMetaInterface (org.pentaho.di.core.row.RowMetaInterface)1 SelectField (org.pentaho.di.trans.steps.selectvalues.SelectValuesMeta.SelectField)1