Search in sources :

Example 1 with StringEvaluationResult

use of org.pentaho.di.core.util.StringEvaluationResult in project pentaho-kettle by pentaho.

the class TextFileCSVImportProgressDialog method doScan.

private String doScan(IProgressMonitor monitor) throws KettleException {
    if (samples > 0) {
        monitor.beginTask(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningFile"), samples + 1);
    } else {
        monitor.beginTask(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningFile"), 2);
    }
    String line = "";
    long fileLineNumber = 0;
    DecimalFormatSymbols dfs = new DecimalFormatSymbols();
    int nrfields = meta.inputFields.length;
    RowMetaInterface outputRowMeta = new RowMeta();
    meta.getFields(outputRowMeta, null, null, null, transMeta, null, null);
    // Remove the storage meta-data (don't go for lazy conversion during scan)
    for (ValueMetaInterface valueMeta : outputRowMeta.getValueMetaList()) {
        valueMeta.setStorageMetadata(null);
        valueMeta.setStorageType(ValueMetaInterface.STORAGE_TYPE_NORMAL);
    }
    RowMetaInterface convertRowMeta = outputRowMeta.cloneToType(ValueMetaInterface.TYPE_STRING);
    // How many null values?
    // How many times null value?
    int[] nrnull = new int[nrfields];
    // String info
    // min string
    String[] minstr = new String[nrfields];
    // max string
    String[] maxstr = new String[nrfields];
    // first occ. of string?
    boolean[] firststr = new boolean[nrfields];
    // Date info
    // is the field perhaps a Date?
    boolean[] isDate = new boolean[nrfields];
    // How many date formats work?
    int[] dateFormatCount = new int[nrfields];
    // What are the date formats that
    boolean[][] dateFormat = new boolean[nrfields][Const.getDateFormats().length];
    // work?
    // min date value
    Date[][] minDate = new Date[nrfields][Const.getDateFormats().length];
    // max date value
    Date[][] maxDate = new Date[nrfields][Const.getDateFormats().length];
    // Number info
    // is the field perhaps a Number?
    boolean[] isNumber = new boolean[nrfields];
    // How many number formats work?
    int[] numberFormatCount = new int[nrfields];
    // What are the number format
    boolean[][] numberFormat = new boolean[nrfields][Const.getNumberFormats().length];
    // that work?
    // min number value
    double[][] minValue = new double[nrfields][Const.getDateFormats().length];
    // max number value
    double[][] maxValue = new double[nrfields][Const.getDateFormats().length];
    // remember the precision?
    int[][] numberPrecision = new int[nrfields][Const.getNumberFormats().length];
    // remember the length?
    int[][] numberLength = new int[nrfields][Const.getNumberFormats().length];
    for (int i = 0; i < nrfields; i++) {
        BaseFileField field = meta.inputFields[i];
        if (log.isDebug()) {
            debug = "init field #" + i;
        }
        if (replaceMeta) {
            // Clear previous info...
            field.setName(meta.inputFields[i].getName());
            field.setType(meta.inputFields[i].getType());
            field.setFormat("");
            field.setLength(-1);
            field.setPrecision(-1);
            field.setCurrencySymbol(dfs.getCurrencySymbol());
            field.setDecimalSymbol("" + dfs.getDecimalSeparator());
            field.setGroupSymbol("" + dfs.getGroupingSeparator());
            field.setNullString("-");
            field.setTrimType(ValueMetaInterface.TRIM_TYPE_NONE);
        }
        nrnull[i] = 0;
        minstr[i] = "";
        maxstr[i] = "";
        firststr[i] = true;
        // Init data guess
        isDate[i] = true;
        for (int j = 0; j < Const.getDateFormats().length; j++) {
            dateFormat[i][j] = true;
            minDate[i][j] = Const.MAX_DATE;
            maxDate[i][j] = Const.MIN_DATE;
        }
        dateFormatCount[i] = Const.getDateFormats().length;
        // Init number guess
        isNumber[i] = true;
        for (int j = 0; j < Const.getNumberFormats().length; j++) {
            numberFormat[i][j] = true;
            minValue[i][j] = Double.MAX_VALUE;
            maxValue[i][j] = -Double.MAX_VALUE;
            numberPrecision[i][j] = -1;
            numberLength[i][j] = -1;
        }
        numberFormatCount[i] = Const.getNumberFormats().length;
    }
    TextFileInputMeta strinfo = (TextFileInputMeta) meta.clone();
    for (int i = 0; i < nrfields; i++) {
        strinfo.inputFields[i].setType(ValueMetaInterface.TYPE_STRING);
    }
    // Sample <samples> rows...
    debug = "get first line";
    StringBuilder lineBuffer = new StringBuilder(256);
    int fileFormatType = meta.getFileFormatTypeNr();
    // If the file has a header we overwrite the first line
    // However, if it doesn't have a header, take a new line
    // 
    line = TextFileInputUtils.getLine(log, reader, encodingType, fileFormatType, lineBuffer);
    fileLineNumber++;
    int skipped = 1;
    if (meta.content.header) {
        while (line != null && skipped < meta.content.nrHeaderLines) {
            line = TextFileInputUtils.getLine(log, reader, encodingType, fileFormatType, lineBuffer);
            skipped++;
            fileLineNumber++;
        }
    }
    int linenr = 1;
    List<StringEvaluator> evaluators = new ArrayList<StringEvaluator>();
    // Allocate number and date parsers
    DecimalFormat df2 = (DecimalFormat) NumberFormat.getInstance();
    DecimalFormatSymbols dfs2 = new DecimalFormatSymbols();
    SimpleDateFormat daf2 = new SimpleDateFormat();
    boolean errorFound = false;
    while (!errorFound && line != null && (linenr <= samples || samples == 0) && !monitor.isCanceled()) {
        monitor.subTask(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningLine", "" + linenr));
        if (samples > 0) {
            monitor.worked(1);
        }
        if (log.isDebug()) {
            debug = "convert line #" + linenr + " to row";
        }
        RowMetaInterface rowMeta = new RowMeta();
        meta.getFields(rowMeta, "stepname", null, null, transMeta, null, null);
        // Remove the storage meta-data (don't go for lazy conversion during scan)
        for (ValueMetaInterface valueMeta : rowMeta.getValueMetaList()) {
            valueMeta.setStorageMetadata(null);
            valueMeta.setStorageType(ValueMetaInterface.STORAGE_TYPE_NORMAL);
        }
        String delimiter = transMeta.environmentSubstitute(meta.content.separator);
        String enclosure = transMeta.environmentSubstitute(meta.content.enclosure);
        String escapeCharacter = transMeta.environmentSubstitute(meta.content.escapeCharacter);
        Object[] r = TextFileInputUtils.convertLineToRow(log, new TextFileLine(line, fileLineNumber, null), strinfo, null, 0, outputRowMeta, convertRowMeta, FileInputList.createFilePathList(transMeta, meta.inputFiles.fileName, meta.inputFiles.fileMask, meta.inputFiles.excludeFileMask, meta.inputFiles.fileRequired, meta.inputFiles.includeSubFolderBoolean())[0], rownumber, delimiter, enclosure, escapeCharacter, null, new BaseFileInputAdditionalField(), null, null, false, null, null, null, null, null);
        if (r == null) {
            errorFound = true;
            continue;
        }
        rownumber++;
        for (int i = 0; i < nrfields && i < r.length; i++) {
            StringEvaluator evaluator;
            if (i >= evaluators.size()) {
                evaluator = new StringEvaluator(true);
                evaluators.add(evaluator);
            } else {
                evaluator = evaluators.get(i);
            }
            String string = rowMeta.getString(r, i);
            if (i == 0) {
                System.out.println();
            }
            evaluator.evaluateString(string);
        }
        fileLineNumber++;
        if (r != null) {
            linenr++;
        }
        // Grab another line...
        // 
        line = TextFileInputUtils.getLine(log, reader, encodingType, fileFormatType, lineBuffer);
    }
    monitor.worked(1);
    monitor.setTaskName(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.AnalyzingResults"));
    // Show information on items using a dialog box
    // 
    StringBuilder message = new StringBuilder();
    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.ResultAfterScanning", "" + (linenr - 1)));
    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.HorizontalLine"));
    for (int i = 0; i < nrfields; i++) {
        BaseFileField field = meta.inputFields[i];
        StringEvaluator evaluator = evaluators.get(i);
        List<StringEvaluationResult> evaluationResults = evaluator.getStringEvaluationResults();
        // 
        if (evaluationResults.isEmpty()) {
            field.setType(ValueMetaInterface.TYPE_STRING);
            field.setLength(evaluator.getMaxLength());
        } else {
            StringEvaluationResult result = evaluator.getAdvicedResult();
            if (result != null) {
                // Take the first option we find, list the others below...
                // 
                ValueMetaInterface conversionMeta = result.getConversionMeta();
                field.setType(conversionMeta.getType());
                field.setTrimType(conversionMeta.getTrimType());
                field.setFormat(conversionMeta.getConversionMask());
                field.setDecimalSymbol(conversionMeta.getDecimalSymbol());
                field.setGroupSymbol(conversionMeta.getGroupingSymbol());
                field.setLength(conversionMeta.getLength());
                field.setPrecision(conversionMeta.getPrecision());
                nrnull[i] = result.getNrNull();
                minstr[i] = result.getMin() == null ? "" : result.getMin().toString();
                maxstr[i] = result.getMax() == null ? "" : result.getMax().toString();
            }
        }
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.FieldNumber", "" + (i + 1)));
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.FieldName", field.getName()));
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.FieldType", field.getTypeDesc()));
        switch(field.getType()) {
            case ValueMetaInterface.TYPE_NUMBER:
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.EstimatedLength", (field.getLength() < 0 ? "-" : "" + field.getLength())));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.EstimatedPrecision", field.getPrecision() < 0 ? "-" : "" + field.getPrecision()));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberFormat", field.getFormat()));
                if (!evaluationResults.isEmpty()) {
                    if (evaluationResults.size() > 1) {
                        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.WarnNumberFormat"));
                    }
                    for (StringEvaluationResult seResult : evaluationResults) {
                        String mask = seResult.getConversionMeta().getConversionMask();
                        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberFormat2", mask));
                        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.TrimType", seResult.getConversionMeta().getTrimType()));
                        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberMinValue", seResult.getMin()));
                        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberMaxValue", seResult.getMax()));
                        try {
                            df2.applyPattern(mask);
                            df2.setDecimalFormatSymbols(dfs2);
                            double mn = df2.parse(seResult.getMin().toString()).doubleValue();
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberExample", mask, seResult.getMin(), Double.toString(mn)));
                        } catch (Exception e) {
                            if (log.isDetailed()) {
                                log.logDetailed("This is unexpected: parsing [" + seResult.getMin() + "] with format [" + mask + "] did not work.");
                            }
                        }
                    }
                }
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberNrNullValues", "" + nrnull[i]));
                break;
            case ValueMetaInterface.TYPE_STRING:
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringMaxLength", "" + field.getLength()));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringMinValue", minstr[i]));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringMaxValue", maxstr[i]));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringNrNullValues", "" + nrnull[i]));
                break;
            case ValueMetaInterface.TYPE_DATE:
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateMaxLength", field.getLength() < 0 ? "-" : "" + field.getLength()));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateFormat", field.getFormat()));
                if (dateFormatCount[i] > 1) {
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.WarnDateFormat"));
                }
                if (!Utils.isEmpty(minstr[i])) {
                    for (int x = 0; x < Const.getDateFormats().length; x++) {
                        if (dateFormat[i][x]) {
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateFormat2", Const.getDateFormats()[x]));
                            Date mindate = minDate[i][x];
                            Date maxdate = maxDate[i][x];
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateMinValue", mindate.toString()));
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateMaxValue", maxdate.toString()));
                            daf2.applyPattern(Const.getDateFormats()[x]);
                            try {
                                Date md = daf2.parse(minstr[i]);
                                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateExample", Const.getDateFormats()[x], minstr[i], md.toString()));
                            } catch (Exception e) {
                                if (log.isDetailed()) {
                                    log.logDetailed("This is unexpected: parsing [" + minstr[i] + "] with format [" + Const.getDateFormats()[x] + "] did not work.");
                                }
                            }
                        }
                    }
                }
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateNrNullValues", "" + nrnull[i]));
                break;
            default:
                break;
        }
        if (nrnull[i] == linenr - 1) {
            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.AllNullValues"));
        }
        message.append(Const.CR);
    }
    monitor.worked(1);
    monitor.done();
    return message.toString();
}
Also used : RowMeta(org.pentaho.di.core.row.RowMeta) DecimalFormat(java.text.DecimalFormat) BaseFileField(org.pentaho.di.trans.steps.file.BaseFileField) ArrayList(java.util.ArrayList) RowMetaInterface(org.pentaho.di.core.row.RowMetaInterface) TextFileLine(org.pentaho.di.trans.steps.fileinput.text.TextFileLine) StringEvaluationResult(org.pentaho.di.core.util.StringEvaluationResult) DecimalFormatSymbols(java.text.DecimalFormatSymbols) Date(java.util.Date) KettleException(org.pentaho.di.core.exception.KettleException) InvocationTargetException(java.lang.reflect.InvocationTargetException) ValueMetaInterface(org.pentaho.di.core.row.ValueMetaInterface) StringEvaluator(org.pentaho.di.core.util.StringEvaluator) TextFileInputMeta(org.pentaho.di.trans.steps.fileinput.text.TextFileInputMeta) BaseFileInputAdditionalField(org.pentaho.di.trans.steps.file.BaseFileInputAdditionalField) SimpleDateFormat(java.text.SimpleDateFormat)

Example 2 with StringEvaluationResult

use of org.pentaho.di.core.util.StringEvaluationResult in project data-access by pentaho.

the class CsvUtils method assumeColumnDetails.

protected void assumeColumnDetails(ColumnInfo profile, List<String> samples) {
    StringEvaluator eval = new StringEvaluator(false, NUMBER_FORMATS, ColumnInfo.DATE_FORMATS);
    for (String sample : samples) {
        eval.evaluateString(sample);
    }
    StringEvaluationResult result = eval.getAdvicedResult();
    ValueMetaInterface meta = result.getConversionMeta();
    int type = meta.getType();
    String mask = meta.getConversionMask();
    int size;
    int precision = meta.getPrecision();
    profile.setFormat(mask);
    profile.setPrecision(precision > 0 ? precision : 0);
    profile.setDataType(convertDataType(type));
    if (meta.isString()) {
        // pad the string lengths
        size = meta.getLength() + (meta.getLength() / 2);
    } else if (meta.isInteger()) {
        size = meta.getLength();
    } else {
        size = precision > 0 ? meta.getLength() : 0;
    }
    profile.setLength(size);
}
Also used : StringEvaluator(org.pentaho.di.core.util.StringEvaluator) StringEvaluationResult(org.pentaho.di.core.util.StringEvaluationResult) ValueMetaInterface(org.pentaho.di.core.row.ValueMetaInterface)

Example 3 with StringEvaluationResult

use of org.pentaho.di.core.util.StringEvaluationResult in project pentaho-kettle by pentaho.

the class TextFileCSVImportProgressDialog method doScan.

private String doScan(IProgressMonitor monitor) throws KettleException {
    if (samples > 0) {
        monitor.beginTask(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningFile"), samples + 1);
    } else {
        monitor.beginTask(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningFile"), 2);
    }
    String line = "";
    long fileLineNumber = 0;
    DecimalFormatSymbols dfs = new DecimalFormatSymbols();
    int nrfields = meta.getInputFields().length;
    RowMetaInterface outputRowMeta = new RowMeta();
    meta.getFields(outputRowMeta, null, null, null, transMeta, null, null);
    // Remove the storage meta-data (don't go for lazy conversion during scan)
    for (ValueMetaInterface valueMeta : outputRowMeta.getValueMetaList()) {
        valueMeta.setStorageMetadata(null);
        valueMeta.setStorageType(ValueMetaInterface.STORAGE_TYPE_NORMAL);
    }
    RowMetaInterface convertRowMeta = outputRowMeta.cloneToType(ValueMetaInterface.TYPE_STRING);
    // How many null values?
    // How many times null value?
    int[] nrnull = new int[nrfields];
    // String info
    // min string
    String[] minstr = new String[nrfields];
    // max string
    String[] maxstr = new String[nrfields];
    // first occ. of string?
    boolean[] firststr = new boolean[nrfields];
    // Date info
    // is the field perhaps a Date?
    boolean[] isDate = new boolean[nrfields];
    // How many date formats work?
    int[] dateFormatCount = new int[nrfields];
    // What are the date formats that
    boolean[][] dateFormat = new boolean[nrfields][Const.getDateFormats().length];
    // work?
    // min date value
    Date[][] minDate = new Date[nrfields][Const.getDateFormats().length];
    // max date value
    Date[][] maxDate = new Date[nrfields][Const.getDateFormats().length];
    // Number info
    // is the field perhaps a Number?
    boolean[] isNumber = new boolean[nrfields];
    // How many number formats work?
    int[] numberFormatCount = new int[nrfields];
    // What are the number format
    boolean[][] numberFormat = new boolean[nrfields][Const.getNumberFormats().length];
    // that work?
    // min number value
    double[][] minValue = new double[nrfields][Const.getDateFormats().length];
    // max number value
    double[][] maxValue = new double[nrfields][Const.getDateFormats().length];
    // remember the precision?
    int[][] numberPrecision = new int[nrfields][Const.getNumberFormats().length];
    // remember the length?
    int[][] numberLength = new int[nrfields][Const.getNumberFormats().length];
    for (int i = 0; i < nrfields; i++) {
        TextFileInputField field = meta.getInputFields()[i];
        if (log.isDebug()) {
            debug = "init field #" + i;
        }
        if (replaceMeta) {
            // Clear previous info...
            field.setName(meta.getInputFields()[i].getName());
            field.setType(meta.getInputFields()[i].getType());
            field.setFormat("");
            field.setLength(-1);
            field.setPrecision(-1);
            field.setCurrencySymbol(dfs.getCurrencySymbol());
            field.setDecimalSymbol("" + dfs.getDecimalSeparator());
            field.setGroupSymbol("" + dfs.getGroupingSeparator());
            field.setNullString("-");
            field.setTrimType(ValueMetaInterface.TRIM_TYPE_NONE);
        }
        nrnull[i] = 0;
        minstr[i] = "";
        maxstr[i] = "";
        firststr[i] = true;
        // Init data guess
        isDate[i] = true;
        for (int j = 0; j < Const.getDateFormats().length; j++) {
            dateFormat[i][j] = true;
            minDate[i][j] = Const.MAX_DATE;
            maxDate[i][j] = Const.MIN_DATE;
        }
        dateFormatCount[i] = Const.getDateFormats().length;
        // Init number guess
        isNumber[i] = true;
        for (int j = 0; j < Const.getNumberFormats().length; j++) {
            numberFormat[i][j] = true;
            minValue[i][j] = Double.MAX_VALUE;
            maxValue[i][j] = -Double.MAX_VALUE;
            numberPrecision[i][j] = -1;
            numberLength[i][j] = -1;
        }
        numberFormatCount[i] = Const.getNumberFormats().length;
    }
    InputFileMetaInterface strinfo = (InputFileMetaInterface) meta.clone();
    for (int i = 0; i < nrfields; i++) {
        strinfo.getInputFields()[i].setType(ValueMetaInterface.TYPE_STRING);
    }
    // Sample <samples> rows...
    debug = "get first line";
    StringBuilder lineBuffer = new StringBuilder(256);
    int fileFormatType = meta.getFileFormatTypeNr();
    // If the file has a header we overwrite the first line
    // However, if it doesn't have a header, take a new line
    // 
    line = TextFileInput.getLine(log, reader, encodingType, fileFormatType, lineBuffer);
    fileLineNumber++;
    int skipped = 1;
    if (meta.hasHeader()) {
        while (line != null && skipped < meta.getNrHeaderLines()) {
            line = TextFileInput.getLine(log, reader, encodingType, fileFormatType, lineBuffer);
            skipped++;
            fileLineNumber++;
        }
    }
    int linenr = 1;
    List<StringEvaluator> evaluators = new ArrayList<StringEvaluator>();
    // Allocate number and date parsers
    DecimalFormat df2 = (DecimalFormat) NumberFormat.getInstance();
    DecimalFormatSymbols dfs2 = new DecimalFormatSymbols();
    SimpleDateFormat daf2 = new SimpleDateFormat();
    boolean errorFound = false;
    while (!errorFound && line != null && (linenr <= samples || samples == 0) && !monitor.isCanceled()) {
        monitor.subTask(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningLine", "" + linenr));
        if (samples > 0) {
            monitor.worked(1);
        }
        if (log.isDebug()) {
            debug = "convert line #" + linenr + " to row";
        }
        RowMetaInterface rowMeta = new RowMeta();
        meta.getFields(rowMeta, "stepname", null, null, transMeta, null, null);
        // Remove the storage meta-data (don't go for lazy conversion during scan)
        for (ValueMetaInterface valueMeta : rowMeta.getValueMetaList()) {
            valueMeta.setStorageMetadata(null);
            valueMeta.setStorageType(ValueMetaInterface.STORAGE_TYPE_NORMAL);
        }
        String delimiter = transMeta.environmentSubstitute(meta.getSeparator());
        String enclosure = transMeta.environmentSubstitute(meta.getEnclosure());
        String escapeCharacter = transMeta.environmentSubstitute(meta.getEscapeCharacter());
        Object[] r = TextFileInput.convertLineToRow(log, new TextFileLine(line, fileLineNumber, null), strinfo, null, 0, outputRowMeta, convertRowMeta, meta.getFilePaths(transMeta)[0], rownumber, delimiter, enclosure, escapeCharacter, null, false, false, false, false, false, false, false, false, null, null, false, null, null, null, null, 0);
        if (r == null) {
            errorFound = true;
            continue;
        }
        rownumber++;
        for (int i = 0; i < nrfields && i < r.length; i++) {
            StringEvaluator evaluator;
            if (i >= evaluators.size()) {
                evaluator = new StringEvaluator(true);
                evaluators.add(evaluator);
            } else {
                evaluator = evaluators.get(i);
            }
            String string = rowMeta.getString(r, i);
            if (i == 0) {
                System.out.println();
            }
            evaluator.evaluateString(string);
        }
        fileLineNumber++;
        if (r != null) {
            linenr++;
        }
        // Grab another line...
        // 
        line = TextFileInput.getLine(log, reader, encodingType, fileFormatType, lineBuffer);
    }
    monitor.worked(1);
    monitor.setTaskName(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.AnalyzingResults"));
    // Show information on items using a dialog box
    // 
    StringBuilder message = new StringBuilder();
    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.ResultAfterScanning", "" + (linenr - 1)));
    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.HorizontalLine"));
    for (int i = 0; i < nrfields; i++) {
        TextFileInputField field = meta.getInputFields()[i];
        StringEvaluator evaluator = evaluators.get(i);
        List<StringEvaluationResult> evaluationResults = evaluator.getStringEvaluationResults();
        // If we didn't find any matching result, it's a String...
        // 
        StringEvaluationResult result = evaluator.getAdvicedResult();
        if (evaluationResults.isEmpty()) {
            field.setType(ValueMetaInterface.TYPE_STRING);
            field.setLength(evaluator.getMaxLength());
        }
        if (result != null) {
            // Take the first option we find, list the others below...
            // 
            ValueMetaInterface conversionMeta = result.getConversionMeta();
            field.setType(conversionMeta.getType());
            field.setTrimType(conversionMeta.getTrimType());
            field.setFormat(conversionMeta.getConversionMask());
            field.setDecimalSymbol(conversionMeta.getDecimalSymbol());
            field.setGroupSymbol(conversionMeta.getGroupingSymbol());
            field.setLength(conversionMeta.getLength());
            field.setPrecision(conversionMeta.getPrecision());
            nrnull[i] = result.getNrNull();
            minstr[i] = result.getMin() == null ? "" : result.getMin().toString();
            maxstr[i] = result.getMax() == null ? "" : result.getMax().toString();
        }
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.FieldNumber", "" + (i + 1)));
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.FieldName", field.getName()));
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.FieldType", field.getTypeDesc()));
        switch(field.getType()) {
            case ValueMetaInterface.TYPE_NUMBER:
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.EstimatedLength", (field.getLength() < 0 ? "-" : "" + field.getLength())));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.EstimatedPrecision", field.getPrecision() < 0 ? "-" : "" + field.getPrecision()));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberFormat", field.getFormat()));
                if (!evaluationResults.isEmpty()) {
                    if (evaluationResults.size() > 1) {
                        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.WarnNumberFormat"));
                    }
                    for (StringEvaluationResult seResult : evaluationResults) {
                        String mask = seResult.getConversionMeta().getConversionMask();
                        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberFormat2", mask));
                        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.TrimType", seResult.getConversionMeta().getTrimType()));
                        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberMinValue", seResult.getMin()));
                        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberMaxValue", seResult.getMax()));
                        try {
                            df2.applyPattern(mask);
                            df2.setDecimalFormatSymbols(dfs2);
                            double mn = df2.parse(seResult.getMin().toString()).doubleValue();
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberExample", mask, seResult.getMin(), Double.toString(mn)));
                        } catch (Exception e) {
                            if (log.isDetailed()) {
                                log.logDetailed("This is unexpected: parsing [" + seResult.getMin() + "] with format [" + mask + "] did not work.");
                            }
                        }
                    }
                }
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberNrNullValues", "" + nrnull[i]));
                break;
            case ValueMetaInterface.TYPE_STRING:
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringMaxLength", "" + field.getLength()));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringMinValue", minstr[i]));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringMaxValue", maxstr[i]));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringNrNullValues", "" + nrnull[i]));
                break;
            case ValueMetaInterface.TYPE_DATE:
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateMaxLength", field.getLength() < 0 ? "-" : "" + field.getLength()));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateFormat", field.getFormat()));
                if (dateFormatCount[i] > 1) {
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.WarnDateFormat"));
                }
                if (!Utils.isEmpty(minstr[i])) {
                    for (int x = 0; x < Const.getDateFormats().length; x++) {
                        if (dateFormat[i][x]) {
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateFormat2", Const.getDateFormats()[x]));
                            Date mindate = minDate[i][x];
                            Date maxdate = maxDate[i][x];
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateMinValue", mindate.toString()));
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateMaxValue", maxdate.toString()));
                            daf2.applyPattern(Const.getDateFormats()[x]);
                            try {
                                Date md = daf2.parse(minstr[i]);
                                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateExample", Const.getDateFormats()[x], minstr[i], md.toString()));
                            } catch (Exception e) {
                                if (log.isDetailed()) {
                                    log.logDetailed("This is unexpected: parsing [" + minstr[i] + "] with format [" + Const.getDateFormats()[x] + "] did not work.");
                                }
                            }
                        }
                    }
                }
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateNrNullValues", "" + nrnull[i]));
                break;
            default:
                break;
        }
        if (nrnull[i] == linenr - 1) {
            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.AllNullValues"));
        }
        message.append(Const.CR);
    }
    monitor.worked(1);
    monitor.done();
    return message.toString();
}
Also used : RowMeta(org.pentaho.di.core.row.RowMeta) DecimalFormat(java.text.DecimalFormat) TextFileInputField(org.pentaho.di.trans.steps.textfileinput.TextFileInputField) ArrayList(java.util.ArrayList) RowMetaInterface(org.pentaho.di.core.row.RowMetaInterface) TextFileLine(org.pentaho.di.trans.steps.textfileinput.TextFileLine) StringEvaluationResult(org.pentaho.di.core.util.StringEvaluationResult) InputFileMetaInterface(org.pentaho.di.trans.steps.textfileinput.InputFileMetaInterface) DecimalFormatSymbols(java.text.DecimalFormatSymbols) Date(java.util.Date) KettleException(org.pentaho.di.core.exception.KettleException) InvocationTargetException(java.lang.reflect.InvocationTargetException) ValueMetaInterface(org.pentaho.di.core.row.ValueMetaInterface) StringEvaluator(org.pentaho.di.core.util.StringEvaluator) SimpleDateFormat(java.text.SimpleDateFormat)

Example 4 with StringEvaluationResult

use of org.pentaho.di.core.util.StringEvaluationResult in project pentaho-kettle by pentaho.

the class TextFileCSVImportProgressDialog method doScan.

private String doScan(IProgressMonitor monitor, final boolean failOnParseError) throws KettleException {
    if (samples > 0) {
        monitor.beginTask(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningFile"), samples + 1);
    } else {
        monitor.beginTask(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningFile"), 2);
    }
    String line = "";
    StringBuilder sbLine = new StringBuilder();
    long fileLineNumber = 0;
    DecimalFormatSymbols dfs = new DecimalFormatSymbols();
    int nrfields = meta.inputFields.length;
    RowMetaInterface outputRowMeta = new RowMeta();
    meta.getFields(outputRowMeta, null, null, null, transMeta, null, null);
    // Remove the storage meta-data (don't go for lazy conversion during scan)
    for (ValueMetaInterface valueMeta : outputRowMeta.getValueMetaList()) {
        valueMeta.setStorageMetadata(null);
        valueMeta.setStorageType(ValueMetaInterface.STORAGE_TYPE_NORMAL);
    }
    RowMetaInterface convertRowMeta = outputRowMeta.cloneToType(ValueMetaInterface.TYPE_STRING);
    // How many null values?
    // How many times null value?
    int[] nrnull = new int[nrfields];
    // String info
    // min string
    String[] minstr = new String[nrfields];
    // max string
    String[] maxstr = new String[nrfields];
    // first occ. of string?
    boolean[] firststr = new boolean[nrfields];
    // Date info
    // is the field perhaps a Date?
    boolean[] isDate = new boolean[nrfields];
    // How many date formats work?
    int[] dateFormatCount = new int[nrfields];
    // What are the date formats that
    boolean[][] dateFormat = new boolean[nrfields][Const.getDateFormats().length];
    // work?
    // min date value
    Date[][] minDate = new Date[nrfields][Const.getDateFormats().length];
    // max date value
    Date[][] maxDate = new Date[nrfields][Const.getDateFormats().length];
    // Number info
    // is the field perhaps a Number?
    boolean[] isNumber = new boolean[nrfields];
    // How many number formats work?
    int[] numberFormatCount = new int[nrfields];
    // What are the number format
    boolean[][] numberFormat = new boolean[nrfields][Const.getNumberFormats().length];
    // that work?
    // min number value
    double[][] minValue = new double[nrfields][Const.getDateFormats().length];
    // max number value
    double[][] maxValue = new double[nrfields][Const.getDateFormats().length];
    // remember the precision?
    int[][] numberPrecision = new int[nrfields][Const.getNumberFormats().length];
    // remember the length?
    int[][] numberLength = new int[nrfields][Const.getNumberFormats().length];
    for (int i = 0; i < nrfields; i++) {
        BaseFileField field = meta.inputFields[i];
        if (log.isDebug()) {
            debug = "init field #" + i;
        }
        if (replaceMeta) {
            // Clear previous info...
            field.setName(meta.inputFields[i].getName());
            field.setType(meta.inputFields[i].getType());
            field.setFormat("");
            field.setLength(-1);
            field.setPrecision(-1);
            field.setCurrencySymbol(dfs.getCurrencySymbol());
            field.setDecimalSymbol("" + dfs.getDecimalSeparator());
            field.setGroupSymbol("" + dfs.getGroupingSeparator());
            field.setNullString("-");
            field.setTrimType(ValueMetaInterface.TRIM_TYPE_NONE);
        }
        nrnull[i] = 0;
        minstr[i] = "";
        maxstr[i] = "";
        firststr[i] = true;
        // Init data guess
        isDate[i] = true;
        for (int j = 0; j < Const.getDateFormats().length; j++) {
            dateFormat[i][j] = true;
            minDate[i][j] = Const.MAX_DATE;
            maxDate[i][j] = Const.MIN_DATE;
        }
        dateFormatCount[i] = Const.getDateFormats().length;
        // Init number guess
        isNumber[i] = true;
        for (int j = 0; j < Const.getNumberFormats().length; j++) {
            numberFormat[i][j] = true;
            minValue[i][j] = Double.MAX_VALUE;
            maxValue[i][j] = -Double.MAX_VALUE;
            numberPrecision[i][j] = -1;
            numberLength[i][j] = -1;
        }
        numberFormatCount[i] = Const.getNumberFormats().length;
    }
    TextFileInputMeta strinfo = (TextFileInputMeta) meta.clone();
    for (int i = 0; i < nrfields; i++) {
        strinfo.inputFields[i].setType(ValueMetaInterface.TYPE_STRING);
    }
    // Sample <samples> rows...
    debug = "get first line";
    StringBuilder lineBuffer = new StringBuilder(256);
    int fileFormatType = meta.getFileFormatTypeNr();
    if (meta.content.header) {
        fileLineNumber = TextFileInputUtils.skipLines(log, reader, encodingType, fileFormatType, lineBuffer, meta.content.nrHeaderLines, meta.getEnclosure(), meta.getEscapeCharacter(), fileLineNumber);
    }
    // Reading the first line of data
    line = TextFileInputUtils.getLine(log, reader, encodingType, fileFormatType, lineBuffer, meta.getEnclosure(), meta.getEscapeCharacter());
    int linenr = 1;
    List<StringEvaluator> evaluators = new ArrayList<StringEvaluator>();
    // Allocate number and date parsers
    DecimalFormat df2 = (DecimalFormat) NumberFormat.getInstance();
    DecimalFormatSymbols dfs2 = new DecimalFormatSymbols();
    SimpleDateFormat daf2 = new SimpleDateFormat();
    boolean errorFound = false;
    while (!errorFound && line != null && (linenr <= samples || samples == 0) && !monitor.isCanceled()) {
        monitor.subTask(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningLine", "" + linenr));
        if (samples > 0) {
            monitor.worked(1);
        }
        if (log.isDebug()) {
            debug = "convert line #" + linenr + " to row";
        }
        RowMetaInterface rowMeta = new RowMeta();
        meta.getFields(rowMeta, "stepname", null, null, transMeta, null, null);
        // Remove the storage meta-data (don't go for lazy conversion during scan)
        for (ValueMetaInterface valueMeta : rowMeta.getValueMetaList()) {
            valueMeta.setStorageMetadata(null);
            valueMeta.setStorageType(ValueMetaInterface.STORAGE_TYPE_NORMAL);
        }
        String delimiter = transMeta.environmentSubstitute(meta.content.separator);
        String enclosure = transMeta.environmentSubstitute(meta.content.enclosure);
        String escapeCharacter = transMeta.environmentSubstitute(meta.content.escapeCharacter);
        Object[] r = TextFileInputUtils.convertLineToRow(log, new TextFileLine(line, fileLineNumber, null), strinfo, null, 0, outputRowMeta, convertRowMeta, FileInputList.createFilePathList(transMeta, meta.inputFiles.fileName, meta.inputFiles.fileMask, meta.inputFiles.excludeFileMask, meta.inputFiles.fileRequired, meta.inputFiles.includeSubFolderBoolean())[0], rownumber, delimiter, enclosure, escapeCharacter, null, new BaseFileInputAdditionalField(), null, null, false, null, null, null, null, null, failOnParseError);
        if (r == null) {
            errorFound = true;
            continue;
        }
        rownumber++;
        for (int i = 0; i < nrfields && i < r.length; i++) {
            StringEvaluator evaluator;
            if (i >= evaluators.size()) {
                evaluator = new StringEvaluator(true);
                evaluators.add(evaluator);
            } else {
                evaluator = evaluators.get(i);
            }
            String string = getStringFromRow(rowMeta, r, i, failOnParseError);
            evaluator.evaluateString(string);
        }
        if (r != null) {
            linenr++;
        }
        // Grab another line...
        TextFileLine textFileLine = TextFileInputUtils.getLine(log, reader, encodingType, fileFormatType, lineBuffer, enclosure, escapeCharacter, fileLineNumber);
        line = textFileLine.getLine();
        fileLineNumber = textFileLine.getLineNumber();
    }
    monitor.worked(1);
    monitor.setTaskName(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.AnalyzingResults"));
    // Show information on items using a dialog box
    // 
    StringBuilder message = new StringBuilder();
    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.ResultAfterScanning", "" + (linenr - 1)));
    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.HorizontalLine"));
    if (nrfields == evaluators.size()) {
        for (int i = 0; i < nrfields; i++) {
            BaseFileField field = meta.inputFields[i];
            StringEvaluator evaluator = evaluators.get(i);
            List<StringEvaluationResult> evaluationResults = evaluator.getStringEvaluationResults();
            // 
            if (evaluationResults.isEmpty()) {
                field.setType(ValueMetaInterface.TYPE_STRING);
                field.setLength(evaluator.getMaxLength());
            } else {
                StringEvaluationResult result = evaluator.getAdvicedResult();
                if (result != null) {
                    // Take the first option we find, list the others below...
                    // 
                    ValueMetaInterface conversionMeta = result.getConversionMeta();
                    field.setType(conversionMeta.getType());
                    field.setTrimType(conversionMeta.getTrimType());
                    field.setFormat(conversionMeta.getConversionMask());
                    field.setDecimalSymbol(conversionMeta.getDecimalSymbol());
                    field.setGroupSymbol(conversionMeta.getGroupingSymbol());
                    field.setLength(conversionMeta.getLength());
                    field.setPrecision(conversionMeta.getPrecision());
                    nrnull[i] = result.getNrNull();
                    minstr[i] = result.getMin() == null ? "" : result.getMin().toString();
                    maxstr[i] = result.getMax() == null ? "" : result.getMax().toString();
                }
            }
            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.FieldNumber", "" + (i + 1)));
            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.FieldName", field.getName()));
            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.FieldType", field.getTypeDesc()));
            switch(field.getType()) {
                case ValueMetaInterface.TYPE_NUMBER:
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.EstimatedLength", (field.getLength() < 0 ? "-" : "" + field.getLength())));
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.EstimatedPrecision", field.getPrecision() < 0 ? "-" : "" + field.getPrecision()));
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberFormat", field.getFormat()));
                    if (!evaluationResults.isEmpty()) {
                        if (evaluationResults.size() > 1) {
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.WarnNumberFormat"));
                        }
                        for (StringEvaluationResult seResult : evaluationResults) {
                            String mask = seResult.getConversionMeta().getConversionMask();
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberFormat2", mask));
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.TrimType", seResult.getConversionMeta().getTrimType()));
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberMinValue", seResult.getMin()));
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberMaxValue", seResult.getMax()));
                            try {
                                df2.applyPattern(mask);
                                df2.setDecimalFormatSymbols(dfs2);
                                double mn = df2.parse(seResult.getMin().toString()).doubleValue();
                                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberExample", mask, seResult.getMin(), Double.toString(mn)));
                            } catch (Exception e) {
                                if (log.isDetailed()) {
                                    log.logDetailed("This is unexpected: parsing [" + seResult.getMin() + "] with format [" + mask + "] did not work.");
                                }
                            }
                        }
                    }
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberNrNullValues", "" + nrnull[i]));
                    break;
                case ValueMetaInterface.TYPE_STRING:
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringMaxLength", "" + field.getLength()));
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringMinValue", minstr[i]));
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringMaxValue", maxstr[i]));
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringNrNullValues", "" + nrnull[i]));
                    break;
                case ValueMetaInterface.TYPE_DATE:
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateMaxLength", field.getLength() < 0 ? "-" : "" + field.getLength()));
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateFormat", field.getFormat()));
                    if (dateFormatCount[i] > 1) {
                        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.WarnDateFormat"));
                    }
                    if (!Utils.isEmpty(minstr[i])) {
                        for (int x = 0; x < Const.getDateFormats().length; x++) {
                            if (dateFormat[i][x]) {
                                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateFormat2", Const.getDateFormats()[x]));
                                Date mindate = minDate[i][x];
                                Date maxdate = maxDate[i][x];
                                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateMinValue", mindate.toString()));
                                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateMaxValue", maxdate.toString()));
                                daf2.applyPattern(Const.getDateFormats()[x]);
                                try {
                                    Date md = daf2.parse(minstr[i]);
                                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateExample", Const.getDateFormats()[x], minstr[i], md.toString()));
                                } catch (Exception e) {
                                    if (log.isDetailed()) {
                                        log.logDetailed("This is unexpected: parsing [" + minstr[i] + "] with format [" + Const.getDateFormats()[x] + "] did not work.");
                                    }
                                }
                            }
                        }
                    }
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateNrNullValues", "" + nrnull[i]));
                    break;
                default:
                    break;
            }
            if (nrnull[i] == linenr - 1) {
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.AllNullValues"));
            }
            message.append(Const.CR);
        }
    }
    monitor.worked(1);
    monitor.done();
    return message.toString();
}
Also used : RowMeta(org.pentaho.di.core.row.RowMeta) DecimalFormat(java.text.DecimalFormat) BaseFileField(org.pentaho.di.trans.steps.file.BaseFileField) ArrayList(java.util.ArrayList) RowMetaInterface(org.pentaho.di.core.row.RowMetaInterface) TextFileLine(org.pentaho.di.trans.steps.fileinput.text.TextFileLine) StringEvaluationResult(org.pentaho.di.core.util.StringEvaluationResult) DecimalFormatSymbols(java.text.DecimalFormatSymbols) Date(java.util.Date) KettleException(org.pentaho.di.core.exception.KettleException) InvocationTargetException(java.lang.reflect.InvocationTargetException) ValueMetaInterface(org.pentaho.di.core.row.ValueMetaInterface) StringEvaluator(org.pentaho.di.core.util.StringEvaluator) TextFileInputMeta(org.pentaho.di.trans.steps.fileinput.text.TextFileInputMeta) BaseFileInputAdditionalField(org.pentaho.di.trans.steps.file.BaseFileInputAdditionalField) SimpleDateFormat(java.text.SimpleDateFormat)

Example 5 with StringEvaluationResult

use of org.pentaho.di.core.util.StringEvaluationResult in project pentaho-kettle by pentaho.

the class TextFileCSVImportProgressDialog method doScan.

private String doScan(IProgressMonitor monitor, final boolean failOnParseError) throws KettleException {
    if (samples > 0) {
        monitor.beginTask(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningFile"), samples + 1);
    } else {
        monitor.beginTask(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningFile"), 2);
    }
    String line = "";
    long fileLineNumber = 0;
    DecimalFormatSymbols dfs = new DecimalFormatSymbols();
    int nrfields = meta.getInputFields().length;
    RowMetaInterface outputRowMeta = new RowMeta();
    meta.getFields(outputRowMeta, null, null, null, transMeta, null, null);
    // Remove the storage meta-data (don't go for lazy conversion during scan)
    for (ValueMetaInterface valueMeta : outputRowMeta.getValueMetaList()) {
        valueMeta.setStorageMetadata(null);
        valueMeta.setStorageType(ValueMetaInterface.STORAGE_TYPE_NORMAL);
    }
    RowMetaInterface convertRowMeta = outputRowMeta.cloneToType(ValueMetaInterface.TYPE_STRING);
    // How many null values?
    // How many times null value?
    int[] nrnull = new int[nrfields];
    // String info
    // min string
    String[] minstr = new String[nrfields];
    // max string
    String[] maxstr = new String[nrfields];
    // first occ. of string?
    boolean[] firststr = new boolean[nrfields];
    // Date info
    // is the field perhaps a Date?
    boolean[] isDate = new boolean[nrfields];
    // How many date formats work?
    int[] dateFormatCount = new int[nrfields];
    // What are the date formats that
    boolean[][] dateFormat = new boolean[nrfields][Const.getDateFormats().length];
    // work?
    // min date value
    Date[][] minDate = new Date[nrfields][Const.getDateFormats().length];
    // max date value
    Date[][] maxDate = new Date[nrfields][Const.getDateFormats().length];
    // Number info
    // is the field perhaps a Number?
    boolean[] isNumber = new boolean[nrfields];
    // How many number formats work?
    int[] numberFormatCount = new int[nrfields];
    // What are the number format
    boolean[][] numberFormat = new boolean[nrfields][Const.getNumberFormats().length];
    // that work?
    // min number value
    double[][] minValue = new double[nrfields][Const.getDateFormats().length];
    // max number value
    double[][] maxValue = new double[nrfields][Const.getDateFormats().length];
    // remember the precision?
    int[][] numberPrecision = new int[nrfields][Const.getNumberFormats().length];
    // remember the length?
    int[][] numberLength = new int[nrfields][Const.getNumberFormats().length];
    for (int i = 0; i < nrfields; i++) {
        TextFileInputField field = meta.getInputFields()[i];
        if (log.isDebug()) {
            debug = "init field #" + i;
        }
        if (replaceMeta) {
            // Clear previous info...
            field.setName(meta.getInputFields()[i].getName());
            field.setType(meta.getInputFields()[i].getType());
            field.setFormat("");
            field.setLength(-1);
            field.setPrecision(-1);
            field.setCurrencySymbol(dfs.getCurrencySymbol());
            field.setDecimalSymbol("" + dfs.getDecimalSeparator());
            field.setGroupSymbol("" + dfs.getGroupingSeparator());
            field.setNullString("-");
            field.setTrimType(ValueMetaInterface.TRIM_TYPE_NONE);
        }
        nrnull[i] = 0;
        minstr[i] = "";
        maxstr[i] = "";
        firststr[i] = true;
        // Init data guess
        isDate[i] = true;
        for (int j = 0; j < Const.getDateFormats().length; j++) {
            dateFormat[i][j] = true;
            minDate[i][j] = Const.MAX_DATE;
            maxDate[i][j] = Const.MIN_DATE;
        }
        dateFormatCount[i] = Const.getDateFormats().length;
        // Init number guess
        isNumber[i] = true;
        for (int j = 0; j < Const.getNumberFormats().length; j++) {
            numberFormat[i][j] = true;
            minValue[i][j] = Double.MAX_VALUE;
            maxValue[i][j] = -Double.MAX_VALUE;
            numberPrecision[i][j] = -1;
            numberLength[i][j] = -1;
        }
        numberFormatCount[i] = Const.getNumberFormats().length;
    }
    InputFileMetaInterface strinfo = (InputFileMetaInterface) meta.clone();
    for (int i = 0; i < nrfields; i++) {
        strinfo.getInputFields()[i].setType(ValueMetaInterface.TYPE_STRING);
    }
    // Sample <samples> rows...
    debug = "get first line";
    StringBuilder lineBuffer = new StringBuilder(256);
    int fileFormatType = meta.getFileFormatTypeNr();
    if (meta.hasHeader()) {
        fileLineNumber = TextFileInputUtils.skipLines(log, reader, encodingType, fileFormatType, lineBuffer, meta.getNrHeaderLines(), meta.getEnclosure(), meta.getEscapeCharacter(), fileLineNumber);
    }
    // Reading the first line of data
    line = TextFileInputUtils.getLine(log, reader, encodingType, fileFormatType, lineBuffer, meta.getEnclosure(), meta.getEscapeCharacter());
    int linenr = 1;
    List<StringEvaluator> evaluators = new ArrayList<StringEvaluator>();
    // Allocate number and date parsers
    DecimalFormat df2 = (DecimalFormat) NumberFormat.getInstance();
    DecimalFormatSymbols dfs2 = new DecimalFormatSymbols();
    SimpleDateFormat daf2 = new SimpleDateFormat();
    boolean errorFound = false;
    while (!errorFound && line != null && (linenr <= samples || samples == 0) && !monitor.isCanceled()) {
        monitor.subTask(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningLine", "" + linenr));
        if (samples > 0) {
            monitor.worked(1);
        }
        if (log.isDebug()) {
            debug = "convert line #" + linenr + " to row";
        }
        RowMetaInterface rowMeta = new RowMeta();
        meta.getFields(rowMeta, "stepname", null, null, transMeta, null, null);
        // Remove the storage meta-data (don't go for lazy conversion during scan)
        for (ValueMetaInterface valueMeta : rowMeta.getValueMetaList()) {
            valueMeta.setStorageMetadata(null);
            valueMeta.setStorageType(ValueMetaInterface.STORAGE_TYPE_NORMAL);
        }
        String delimiter = transMeta.environmentSubstitute(meta.getSeparator());
        String enclosure = transMeta.environmentSubstitute(meta.getEnclosure());
        String escapeCharacter = transMeta.environmentSubstitute(meta.getEscapeCharacter());
        Object[] r = TextFileInput.convertLineToRow(log, new TextFileLine(line, fileLineNumber, null), strinfo, null, 0, outputRowMeta, convertRowMeta, meta.getFilePaths(transMeta)[0], rownumber, delimiter, enclosure, escapeCharacter, null, false, false, false, false, false, false, false, false, null, null, false, null, null, null, null, 0, failOnParseError);
        if (r == null) {
            errorFound = true;
            continue;
        }
        rownumber++;
        for (int i = 0; i < nrfields && i < r.length; i++) {
            StringEvaluator evaluator;
            if (i >= evaluators.size()) {
                evaluator = new StringEvaluator(true);
                evaluators.add(evaluator);
            } else {
                evaluator = evaluators.get(i);
            }
            String string = getStringFromRow(rowMeta, r, i, failOnParseError);
            evaluator.evaluateString(string);
        }
        if (r != null) {
            linenr++;
        }
        // Grab another line...
        TextFileLine textFileLine = TextFileInputUtils.getLine(log, reader, encodingType, fileFormatType, lineBuffer, enclosure, escapeCharacter, fileLineNumber);
        line = textFileLine.getLine();
        fileLineNumber = textFileLine.getLineNumber();
    }
    monitor.worked(1);
    monitor.setTaskName(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.AnalyzingResults"));
    // Show information on items using a dialog box
    // 
    StringBuilder message = new StringBuilder();
    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.ResultAfterScanning", "" + (linenr - 1)));
    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.HorizontalLine"));
    if (nrfields == evaluators.size()) {
        for (int i = 0; i < nrfields; i++) {
            TextFileInputField field = meta.getInputFields()[i];
            StringEvaluator evaluator = evaluators.get(i);
            List<StringEvaluationResult> evaluationResults = evaluator.getStringEvaluationResults();
            // If we didn't find any matching result, it's a String...
            // 
            StringEvaluationResult result = evaluator.getAdvicedResult();
            if (evaluationResults.isEmpty()) {
                field.setType(ValueMetaInterface.TYPE_STRING);
                field.setLength(evaluator.getMaxLength());
            }
            if (result != null) {
                // Take the first option we find, list the others below...
                // 
                ValueMetaInterface conversionMeta = result.getConversionMeta();
                field.setType(conversionMeta.getType());
                field.setTrimType(conversionMeta.getTrimType());
                field.setFormat(conversionMeta.getConversionMask());
                field.setDecimalSymbol(conversionMeta.getDecimalSymbol());
                field.setGroupSymbol(conversionMeta.getGroupingSymbol());
                field.setLength(conversionMeta.getLength());
                field.setPrecision(conversionMeta.getPrecision());
                nrnull[i] = result.getNrNull();
                minstr[i] = result.getMin() == null ? "" : result.getMin().toString();
                maxstr[i] = result.getMax() == null ? "" : result.getMax().toString();
            }
            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.FieldNumber", "" + (i + 1)));
            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.FieldName", field.getName()));
            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.FieldType", field.getTypeDesc()));
            switch(field.getType()) {
                case ValueMetaInterface.TYPE_NUMBER:
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.EstimatedLength", (field.getLength() < 0 ? "-" : "" + field.getLength())));
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.EstimatedPrecision", field.getPrecision() < 0 ? "-" : "" + field.getPrecision()));
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberFormat", field.getFormat()));
                    if (!evaluationResults.isEmpty()) {
                        if (evaluationResults.size() > 1) {
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.WarnNumberFormat"));
                        }
                        for (StringEvaluationResult seResult : evaluationResults) {
                            String mask = seResult.getConversionMeta().getConversionMask();
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberFormat2", mask));
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.TrimType", seResult.getConversionMeta().getTrimType()));
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberMinValue", seResult.getMin()));
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberMaxValue", seResult.getMax()));
                            try {
                                df2.applyPattern(mask);
                                df2.setDecimalFormatSymbols(dfs2);
                                double mn = df2.parse(seResult.getMin().toString()).doubleValue();
                                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberExample", mask, seResult.getMin(), Double.toString(mn)));
                            } catch (Exception e) {
                                if (log.isDetailed()) {
                                    log.logDetailed("This is unexpected: parsing [" + seResult.getMin() + "] with format [" + mask + "] did not work.");
                                }
                            }
                        }
                    }
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberNrNullValues", "" + nrnull[i]));
                    break;
                case ValueMetaInterface.TYPE_STRING:
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringMaxLength", "" + field.getLength()));
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringMinValue", minstr[i]));
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringMaxValue", maxstr[i]));
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringNrNullValues", "" + nrnull[i]));
                    break;
                case ValueMetaInterface.TYPE_DATE:
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateMaxLength", field.getLength() < 0 ? "-" : "" + field.getLength()));
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateFormat", field.getFormat()));
                    if (dateFormatCount[i] > 1) {
                        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.WarnDateFormat"));
                    }
                    if (!Utils.isEmpty(minstr[i])) {
                        for (int x = 0; x < Const.getDateFormats().length; x++) {
                            if (dateFormat[i][x]) {
                                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateFormat2", Const.getDateFormats()[x]));
                                Date mindate = minDate[i][x];
                                Date maxdate = maxDate[i][x];
                                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateMinValue", mindate.toString()));
                                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateMaxValue", maxdate.toString()));
                                daf2.applyPattern(Const.getDateFormats()[x]);
                                try {
                                    Date md = daf2.parse(minstr[i]);
                                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateExample", Const.getDateFormats()[x], minstr[i], md.toString()));
                                } catch (Exception e) {
                                    if (log.isDetailed()) {
                                        log.logDetailed("This is unexpected: parsing [" + minstr[i] + "] with format [" + Const.getDateFormats()[x] + "] did not work.");
                                    }
                                }
                            }
                        }
                    }
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateNrNullValues", "" + nrnull[i]));
                    break;
                default:
                    break;
            }
            if (nrnull[i] == linenr - 1) {
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.AllNullValues"));
            }
            message.append(Const.CR);
        }
    }
    monitor.worked(1);
    monitor.done();
    return message.toString();
}
Also used : RowMeta(org.pentaho.di.core.row.RowMeta) DecimalFormat(java.text.DecimalFormat) TextFileInputField(org.pentaho.di.trans.steps.textfileinput.TextFileInputField) ArrayList(java.util.ArrayList) RowMetaInterface(org.pentaho.di.core.row.RowMetaInterface) TextFileLine(org.pentaho.di.trans.steps.fileinput.text.TextFileLine) StringEvaluationResult(org.pentaho.di.core.util.StringEvaluationResult) InputFileMetaInterface(org.pentaho.di.trans.steps.textfileinput.InputFileMetaInterface) DecimalFormatSymbols(java.text.DecimalFormatSymbols) Date(java.util.Date) KettleException(org.pentaho.di.core.exception.KettleException) InvocationTargetException(java.lang.reflect.InvocationTargetException) ValueMetaInterface(org.pentaho.di.core.row.ValueMetaInterface) StringEvaluator(org.pentaho.di.core.util.StringEvaluator) SimpleDateFormat(java.text.SimpleDateFormat)

Aggregations

ValueMetaInterface (org.pentaho.di.core.row.ValueMetaInterface)5 StringEvaluationResult (org.pentaho.di.core.util.StringEvaluationResult)5 StringEvaluator (org.pentaho.di.core.util.StringEvaluator)5 InvocationTargetException (java.lang.reflect.InvocationTargetException)4 DecimalFormat (java.text.DecimalFormat)4 DecimalFormatSymbols (java.text.DecimalFormatSymbols)4 SimpleDateFormat (java.text.SimpleDateFormat)4 ArrayList (java.util.ArrayList)4 Date (java.util.Date)4 KettleException (org.pentaho.di.core.exception.KettleException)4 RowMeta (org.pentaho.di.core.row.RowMeta)4 RowMetaInterface (org.pentaho.di.core.row.RowMetaInterface)4 TextFileLine (org.pentaho.di.trans.steps.fileinput.text.TextFileLine)3 BaseFileField (org.pentaho.di.trans.steps.file.BaseFileField)2 BaseFileInputAdditionalField (org.pentaho.di.trans.steps.file.BaseFileInputAdditionalField)2 TextFileInputMeta (org.pentaho.di.trans.steps.fileinput.text.TextFileInputMeta)2 InputFileMetaInterface (org.pentaho.di.trans.steps.textfileinput.InputFileMetaInterface)2 TextFileInputField (org.pentaho.di.trans.steps.textfileinput.TextFileInputField)2 TextFileLine (org.pentaho.di.trans.steps.textfileinput.TextFileLine)1