Use of org.pentaho.di.core.exception.KettleConversionException in project pentaho-kettle by pentaho.
The class CsvInput, method processRow.
public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException {
  meta = (CsvInputMeta) smi;
  data = (CsvInputData) sdi;
  if (first) {
    first = false;
    data.outputRowMeta = new RowMeta();
    meta.getFields(data.outputRowMeta, getStepname(), null, null, this, repository, metaStore);
    if (data.filenames == null) {
      // We're expecting the list of filenames from the previous step(s)...
      //
      getFilenamesFromPreviousSteps();
    }
    // We only run in parallel if we have at least one file to process
    // AND if we have more than one step copy running...
    //
    data.parallel = meta.isRunningInParallel() && data.totalNumberOfSteps > 1;
    // The conversion logic for when lazy conversion is turned off is simple:
    // Pretend it's a lazy conversion object anyway and get the native type during conversion.
    //
    data.convertRowMeta = data.outputRowMeta.clone();
    for (ValueMetaInterface valueMeta : data.convertRowMeta.getValueMetaList()) {
      valueMeta.setStorageType(ValueMetaInterface.STORAGE_TYPE_BINARY_STRING);
    }
    // Calculate the indexes for the filename and row number fields
    //
    data.filenameFieldIndex = -1;
    if (!Utils.isEmpty(meta.getFilenameField()) && meta.isIncludingFilename()) {
      data.filenameFieldIndex = meta.getInputFields().length;
    }
    data.rownumFieldIndex = -1;
    if (!Utils.isEmpty(meta.getRowNumField())) {
      data.rownumFieldIndex = meta.getInputFields().length;
      if (data.filenameFieldIndex >= 0) {
        data.rownumFieldIndex++;
      }
    }
    //
    if (data.parallel) {
      prepareToRunInParallel();
    }
    //
    if (!openNextFile()) {
      setOutputDone();
      // nothing to see here, move along...
      return false;
    }
  }
  //
  if (data.parallel) {
    if (data.totalBytesRead >= data.blockToRead) {
      // stop reading
      setOutputDone();
      return false;
    }
  }
  try {
    // get row, set busy!
    Object[] outputRowData = readOneRow(false, false);
    // no more input to be expected...
    if (outputRowData == null) {
      if (openNextFile()) {
        // try again on the next loop...
        return true;
      } else {
        // last file, end here
        setOutputDone();
        return false;
      }
    } else {
      // copy row to possible alternate rowset(s).
      putRow(data.outputRowMeta, outputRowData);
      if (checkFeedback(getLinesInput())) {
        if (log.isBasic()) {
          logBasic(BaseMessages.getString(PKG, "CsvInput.Log.LineNumber", Long.toString(getLinesInput())));
        }
      }
    }
  } catch (KettleConversionException e) {
    if (getStepMeta().isDoingErrorHandling()) {
      StringBuilder errorDescriptions = new StringBuilder(100);
      StringBuilder errorFields = new StringBuilder(50);
      for (int i = 0; i < e.getCauses().size(); i++) {
        if (i > 0) {
          errorDescriptions.append(", ");
          errorFields.append(", ");
        }
        errorDescriptions.append(e.getCauses().get(i).getMessage());
        errorFields.append(e.getFields().get(i).toStringMeta());
      }
      putError(data.outputRowMeta, e.getRowData(), e.getCauses().size(), errorDescriptions.toString(), errorFields.toString(), "CSVINPUT001");
    } else {
      //
      throw new KettleException(e.getMessage(), e.getCauses().get(0));
    }
  }
  return true;
}
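When the step is configured with error handling, the catch block above does not abort the transformation: it flattens every cause and every offending field carried by the KettleConversionException into two comma-separated strings and hands the original row to putError under the error code "CSVINPUT001". Below is a minimal sketch of that flattening pulled out as a standalone helper; the class name ConversionErrorFormatter is hypothetical (not part of pentaho-kettle), but the getCauses(), getFields() and toStringMeta() calls are exactly those used above.

import org.pentaho.di.core.exception.KettleConversionException;

// Hypothetical helper mirroring the aggregation in CsvInput.processRow:
// one comma-separated string of error messages and one of field metadata,
// both derived from the same KettleConversionException.
public final class ConversionErrorFormatter {

  private ConversionErrorFormatter() {
  }

  public static String describeCauses(KettleConversionException e) {
    StringBuilder descriptions = new StringBuilder();
    for (int i = 0; i < e.getCauses().size(); i++) {
      if (i > 0) {
        descriptions.append(", ");
      }
      descriptions.append(e.getCauses().get(i).getMessage());
    }
    return descriptions.toString();
  }

  public static String describeFields(KettleConversionException e) {
    StringBuilder fields = new StringBuilder();
    for (int i = 0; i < e.getFields().size(); i++) {
      if (i > 0) {
        fields.append(", ");
      }
      fields.append(e.getFields().get(i).toStringMeta());
    }
    return fields.toString();
  }
}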
Use of org.pentaho.di.core.exception.KettleConversionException in project pentaho-kettle by pentaho.
The class CsvInput, method readOneRow.
/**
 * Read a single row of data from the file...
 *
 * @param skipRow if the row should be skipped: the header row, or part of a row during a parallel read
 * @param ignoreEnclosures if enclosures should be ignored, i.e. when we need to skip part of the row during a
 *          parallel read
 * @return a row of data...
 * @throws KettleException
 */
private Object[] readOneRow(boolean skipRow, boolean ignoreEnclosures) throws KettleException {
  try {
    Object[] outputRowData = RowDataUtil.allocateRowData(data.outputRowMeta.size());
    int outputIndex = 0;
    boolean newLineFound = false;
    boolean endOfBuffer = false;
    List<Exception> conversionExceptions = null;
    List<ValueMetaInterface> exceptionFields = null;
    //
    while (!newLineFound && outputIndex < data.fieldsMapping.size()) {
      if (data.resizeBufferIfNeeded()) {
        // there is no end of line delimiter
        if (outputRowData != null) {
          // filling the rest of them with null
          if (outputIndex > 0) {
            //
            if (meta.isIncludingFilename() && !Utils.isEmpty(meta.getFilenameField())) {
              if (meta.isLazyConversionActive()) {
                outputRowData[data.filenameFieldIndex] = data.binaryFilename;
              } else {
                outputRowData[data.filenameFieldIndex] = data.filenames[data.filenr - 1];
              }
            }
            if (data.isAddingRowNumber) {
              outputRowData[data.rownumFieldIndex] = data.rowNumber++;
            }
            incrementLinesInput();
            return outputRowData;
          }
        }
        // nothing more to read, call it a day.
        return null;
      }
      // OK, at this point we should have data in the byteBuffer and we should be able to scan for the next
      // delimiter (;)
      // So let's look for a delimiter.
      // Also skip over the enclosures ("); this does NOT take escaped enclosures into account.
      // Later we can add an option for having escaped or double enclosures in the file. <sigh>
      //
      boolean delimiterFound = false;
      boolean enclosureFound = false;
      boolean doubleLineEnd = false;
      int escapedEnclosureFound = 0;
      boolean ignoreEnclosuresInField = ignoreEnclosures;
      while (!delimiterFound && !newLineFound && !endOfBuffer) {
        //
        if (data.delimiterFound()) {
          delimiterFound = true;
        } else if ((!meta.isNewlinePossibleInFields() || outputIndex == data.fieldsMapping.size() - 1) && data.newLineFound()) {
          // Perhaps we found a (pre-mature) new line?
          //
          // In case we are not using an enclosure and in case fields contain new lines
          // we need to make sure that we check the newlines possible flag.
          // If the flag is enabled we skip newline checking except for the last field in the row.
          // In that one we can't support newlines without enclosure (handled below).
          //
          newLineFound = true;
          // Skip new line character
          for (int i = 0; i < data.encodingType.getLength(); i++) {
            data.moveEndBufferPointer();
          }
          // Re-check for double new line (\r\n)...
          if (data.newLineFound()) {
            // Found another one, need to skip it later
            doubleLineEnd = true;
          }
        } else if (data.enclosureFound() && !ignoreEnclosuresInField) {
          int enclosurePosition = data.getEndBuffer();
          int fieldFirstBytePosition = data.getStartBuffer();
          if (fieldFirstBytePosition == enclosurePosition) {
            // Perhaps we need to skip over an enclosed part?
            // We always expect exactly one enclosure character
            // If we find the enclosure doubled, we consider it escaped.
            // --> "" is converted to " later on.
            //
            enclosureFound = true;
            boolean keepGoing;
            do {
              if (data.moveEndBufferPointer()) {
                enclosureFound = false;
                break;
              }
              keepGoing = !data.enclosureFound();
              if (!keepGoing) {
                // Read another byte...
                if (!data.endOfBuffer() && data.moveEndBufferPointer()) {
                  break;
                }
                if (data.enclosure.length > 1) {
                  data.moveEndBufferPointer();
                }
                // If this character is also an enclosure, we can consider the enclosure "escaped".
                // As such, if this is an enclosure, we keep going...
                //
                keepGoing = data.enclosureFound();
                if (keepGoing) {
                  escapedEnclosureFound++;
                }
              }
            } while (keepGoing);
            //
            if (data.endOfBuffer()) {
              endOfBuffer = true;
              break;
            }
          } else {
            // Ignoring enclosure if it's not at the field start
            ignoreEnclosuresInField = true;
          }
        } else {
          if (data.moveEndBufferPointer()) {
            endOfBuffer = true;
            break;
          }
        }
      }
      // If we're still here, we found a delimiter...
      // Since the starting point never really changed, we can just grab the range:
      //
      // [startBuffer-endBuffer[
      //
      // This is the part we want.
      // data.byteBuffer[data.startBuffer]
      //
      byte[] field = data.getField(delimiterFound, enclosureFound, newLineFound, endOfBuffer);
      //
      if (escapedEnclosureFound > 0) {
        if (log.isRowLevel()) {
          logRowlevel("Escaped enclosures found in " + new String(field));
        }
        field = data.removeEscapedEnclosures(field, escapedEnclosureFound);
      }
      final int currentFieldIndex = outputIndex++;
      final int actualFieldIndex = data.fieldsMapping.fieldMetaIndex(currentFieldIndex);
      if (actualFieldIndex != FieldsMapping.FIELD_DOES_NOT_EXIST) {
        if (!skipRow) {
          if (meta.isLazyConversionActive()) {
            outputRowData[actualFieldIndex] = field;
          } else {
            // We're not lazy, so we convert the data right here and now.
            // The convert object uses binary storage, so we just have to ask it for the native type.
            // That will do the actual conversion.
            //
            ValueMetaInterface sourceValueMeta = data.convertRowMeta.getValueMeta(actualFieldIndex);
            try {
              outputRowData[actualFieldIndex] = sourceValueMeta.convertBinaryStringToNativeType(field);
            } catch (KettleValueException e) {
              // There was a conversion error,
              //
              outputRowData[actualFieldIndex] = null;
              if (conversionExceptions == null) {
                conversionExceptions = new ArrayList<Exception>();
                exceptionFields = new ArrayList<ValueMetaInterface>();
              }
              conversionExceptions.add(e);
              exceptionFields.add(sourceValueMeta);
            }
          }
        } else {
          // nothing for the header, no conversions here.
          outputRowData[actualFieldIndex] = null;
        }
      }
      // empty column at the end of the row (see the Jira case for details)
      if ((!newLineFound && outputIndex < data.fieldsMapping.size()) || (newLineFound && doubleLineEnd)) {
        int i = 0;
        while ((!data.newLineFound() && (i < data.delimiter.length))) {
          data.moveEndBufferPointer();
          i++;
        }
        if (data.newLineFound() && outputIndex >= data.fieldsMapping.size()) {
          data.moveEndBufferPointer();
        }
        if (doubleLineEnd && data.encodingType.getLength() > 1) {
          data.moveEndBufferPointer();
        }
      }
      data.setStartBuffer(data.getEndBuffer());
    }
    //
    if (!newLineFound && !data.resizeBufferIfNeeded()) {
      do {
        data.moveEndBufferPointer();
        if (data.resizeBufferIfNeeded()) {
          // nothing more to read.
          break;
        }
        // TODO: if we're using quoting we might be dealing with a very dirty file with quoted newlines in trailing
        // fields. (imagine that)
        // In that particular case we want to use the same logic we use above (refactored a bit) to skip these fields.
      } while (!data.newLineFound());
      if (!data.resizeBufferIfNeeded()) {
        while (data.newLineFound()) {
          data.moveEndBufferPointer();
          if (data.resizeBufferIfNeeded()) {
            // nothing more to read.
            break;
          }
        }
      }
      // Make sure we start at the right position the next time around.
      data.setStartBuffer(data.getEndBuffer());
    }
    //
    if (meta.isIncludingFilename() && !Utils.isEmpty(meta.getFilenameField())) {
      if (meta.isLazyConversionActive()) {
        outputRowData[data.filenameFieldIndex] = data.binaryFilename;
      } else {
        outputRowData[data.filenameFieldIndex] = data.filenames[data.filenr - 1];
      }
    }
    if (data.isAddingRowNumber) {
      outputRowData[data.rownumFieldIndex] = data.rowNumber++;
    }
    if (!ignoreEnclosures) {
      incrementLinesInput();
    }
    if (conversionExceptions != null && conversionExceptions.size() > 0) {
      //
      throw new KettleConversionException("There were " + conversionExceptions.size() + " conversion errors on line " + getLinesInput(), conversionExceptions, exceptionFields, outputRowData);
    }
    return outputRowData;
  } catch (KettleConversionException e) {
    throw e;
  } catch (IOException e) {
    throw new KettleFileException("Exception reading line using NIO", e);
  }
}
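The notable pattern in readOneRow is that a bad field does not abort the row: each KettleValueException is recorded together with the ValueMetaInterface of the failing field, and a single KettleConversionException carrying all causes plus the partially converted row is thrown only once the row is complete. Below is a minimal sketch of that collect-then-throw pattern in isolation; the class and method names are hypothetical, while the KettleConversionException constructor and the convertBinaryStringToNativeType call match their use above.

import java.util.ArrayList;
import java.util.List;

import org.pentaho.di.core.exception.KettleConversionException;
import org.pentaho.di.core.exception.KettleValueException;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.core.row.ValueMetaInterface;

public final class RowConversion {

  private RowConversion() {
  }

  // Convert every field of a row; collect failures instead of failing fast,
  // then throw one aggregated exception that still carries the row data.
  public static Object[] convertRow(RowMetaInterface convertRowMeta, byte[][] binaryFields) throws KettleConversionException {
    Object[] row = new Object[binaryFields.length];
    List<Exception> causes = null;
    List<ValueMetaInterface> errorFields = null;
    for (int i = 0; i < binaryFields.length; i++) {
      ValueMetaInterface valueMeta = convertRowMeta.getValueMeta(i);
      try {
        row[i] = valueMeta.convertBinaryStringToNativeType(binaryFields[i]);
      } catch (KettleValueException e) {
        // Keep the row alive; remember the failure and which field caused it.
        row[i] = null;
        if (causes == null) {
          causes = new ArrayList<Exception>();
          errorFields = new ArrayList<ValueMetaInterface>();
        }
        causes.add(e);
        errorFields.add(valueMeta);
      }
    }
    if (causes != null) {
      throw new KettleConversionException("There were " + causes.size() + " conversion errors", causes, errorFields, row);
    }
    return row;
  }
}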
Use of org.pentaho.di.core.exception.KettleConversionException in project pentaho-kettle by pentaho.
The class SelectValues, method metadataValues.
/**
 * Change the meta-data of certain fields.
 * <p/>
 * This, we can do VERY fast.
 * <p/>
 *
 * @param rowMeta the metadata of the row to manipulate
 * @param rowData the row data to manipulate
 * @return the altered rowData array
 * @throws KettleException
 */
@VisibleForTesting
synchronized Object[] metadataValues(RowMetaInterface rowMeta, Object[] rowData) throws KettleException {
  if (data.firstmetadata) {
    data.firstmetadata = false;
    data.metanrs = new int[meta.getMeta().length];
    for (int i = 0; i < data.metanrs.length; i++) {
      data.metanrs[i] = rowMeta.indexOfValue(meta.getMeta()[i].getName());
      if (data.metanrs[i] < 0) {
        logError(BaseMessages.getString(PKG, "SelectValues.Log.CouldNotFindField", meta.getMeta()[i].getName()));
        setErrors(1);
        stopAll();
        return null;
      }
    }
    // Check for duplicates in the selected fields...
    int[] cnt = new int[meta.getMeta().length];
    for (int i = 0; i < meta.getMeta().length; i++) {
      cnt[i] = 0;
      for (int j = 0; j < meta.getMeta().length; j++) {
        if (meta.getMeta()[i].getName().equals(meta.getMeta()[j].getName())) {
          cnt[i]++;
        }
        if (cnt[i] > 1) {
          logError(BaseMessages.getString(PKG, "SelectValues.Log.FieldCouldNotSpecifiedMoreThanTwice2", meta.getMeta()[i].getName()));
          setErrors(1);
          stopAll();
          return null;
        }
      }
    }
    //
    for (int i = 0; i < data.metanrs.length; i++) {
      SelectMetadataChange change = meta.getMeta()[i];
      ValueMetaInterface valueMeta = rowMeta.getValueMeta(data.metanrs[i]);
      if (!Utils.isEmpty(change.getConversionMask())) {
        valueMeta.setConversionMask(change.getConversionMask());
      }
      valueMeta.setDateFormatLenient(change.isDateFormatLenient());
      valueMeta.setDateFormatLocale(EnvUtil.createLocale(change.getDateFormatLocale()));
      valueMeta.setDateFormatTimeZone(EnvUtil.createTimeZone(change.getDateFormatTimeZone()));
      valueMeta.setLenientStringToNumber(change.isLenientStringToNumber());
      if (!Utils.isEmpty(change.getEncoding())) {
        valueMeta.setStringEncoding(change.getEncoding());
      }
      if (!Utils.isEmpty(change.getDecimalSymbol())) {
        valueMeta.setDecimalSymbol(change.getDecimalSymbol());
      }
      if (!Utils.isEmpty(change.getGroupingSymbol())) {
        valueMeta.setGroupingSymbol(change.getGroupingSymbol());
      }
      if (!Utils.isEmpty(change.getCurrencySymbol())) {
        valueMeta.setCurrencySymbol(change.getCurrencySymbol());
      }
    }
  }
  //
  for (int i = 0; i < data.metanrs.length; i++) {
    int index = data.metanrs[i];
    ValueMetaInterface fromMeta = rowMeta.getValueMeta(index);
    ValueMetaInterface toMeta = data.metadataRowMeta.getValueMeta(index);
    //
    try {
      if (fromMeta.isStorageBinaryString() && meta.getMeta()[i].getStorageType() == ValueMetaInterface.STORAGE_TYPE_NORMAL) {
        rowData[index] = fromMeta.convertBinaryStringToNativeType((byte[]) rowData[index]);
      }
      if (meta.getMeta()[i].getType() != ValueMetaInterface.TYPE_NONE && fromMeta.getType() != toMeta.getType()) {
        rowData[index] = toMeta.convertData(fromMeta, rowData[index]);
      }
    } catch (KettleValueException e) {
      throw new KettleConversionException(e.getMessage(), Collections.<Exception>singletonList(e), Collections.singletonList(toMeta), rowData);
    }
  }
  return rowData;
}
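Unlike CsvInput, SelectValues stops at the first failing field: the KettleValueException is wrapped immediately in a KettleConversionException with singleton cause and field lists, so a caller doing error handling can still name the offending field. A minimal sketch of that single-cause wrapping follows; the class and method names are hypothetical, while convertData and the exception constructor are used exactly as above.

import java.util.Collections;

import org.pentaho.di.core.exception.KettleConversionException;
import org.pentaho.di.core.exception.KettleValueException;
import org.pentaho.di.core.row.ValueMetaInterface;

public final class MetaConversion {

  private MetaConversion() {
  }

  // Wrap the first conversion failure so that the cause, the failing field's
  // metadata, and the row data travel together to the caller's error handling.
  public static Object convertOrWrap(ValueMetaInterface fromMeta, ValueMetaInterface toMeta, Object[] rowData, int index) throws KettleConversionException {
    try {
      return toMeta.convertData(fromMeta, rowData[index]);
    } catch (KettleValueException e) {
      throw new KettleConversionException(e.getMessage(), Collections.<Exception>singletonList(e), Collections.singletonList(toMeta), rowData);
    }
  }
}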
Use of org.pentaho.di.core.exception.KettleConversionException in project pentaho-kettle by pentaho.
The class SelectValuesTest, method errorRowSetObtainsFieldName.
@Test
public void errorRowSetObtainsFieldName() throws Exception {
  SelectValuesMeta stepMeta = new SelectValuesMeta();
  stepMeta.allocate(1, 0, 1);
  stepMeta.getSelectFields()[0] = new SelectField();
  stepMeta.getSelectFields()[0].setName(SELECTED_FIELD);
  stepMeta.getMeta()[0] = new SelectMetadataChange(stepMeta, SELECTED_FIELD, null, ValueMetaInterface.TYPE_INTEGER, -2, -2, ValueMetaInterface.STORAGE_TYPE_NORMAL, null, false, null, null, false, null, null, null);
  SelectValuesData stepData = new SelectValuesData();
  stepData.select = true;
  stepData.metadata = true;
  stepData.firstselect = true;
  stepData.firstmetadata = true;
  step.processRow(stepMeta, stepData);
  verify(step).putError(any(RowMetaInterface.class), any(Object[].class), anyLong(), anyString(), eq(SELECTED_FIELD), anyString());
  // additionally ensure that the conversion error raises a KettleConversionException
  boolean properException = false;
  try {
    step.metadataValues(step.getInputRowMeta(), inputRow);
  } catch (KettleConversionException e) {
    properException = true;
  }
  assertTrue(properException);
}
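The try/flag idiom above only proves the exception type. A slightly stronger variant, sketched below, also inspects the exception's payload; it assumes the same step and inputRow fixtures, JUnit 4 static imports (org.junit.Assert.fail, org.junit.Assert.assertEquals), and that the rename in the SelectMetadataChange is null so the field keeps its original name.

try {
  step.metadataValues(step.getInputRowMeta(), inputRow);
  fail("Expected a KettleConversionException");
} catch (KettleConversionException e) {
  // metadataValues wraps exactly one cause and one field (see above)
  assertEquals(1, e.getCauses().size());
  assertEquals(SELECTED_FIELD, e.getFields().get(0).getName());
}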