Search in sources :

Example 1 with TextFileLine

Use of org.pentaho.di.trans.steps.textfileinput.TextFileLine in project pentaho-kettle by pentaho.

The class TextFileCSVImportProgressDialog, method doScan.

/**
 * Samples lines from the instance's {@code reader}, feeds every column value to a per-field
 * {@link StringEvaluator}, writes the advised type/format/length/precision back into the
 * input fields of {@code meta}, and builds a human-readable report of the findings.
 *
 * <p>Scanning stops when the reader is exhausted, when {@code samples} lines were read
 * ({@code samples == 0} means "no limit"), when the monitor is cancelled, or when a line
 * cannot be converted to a row.
 *
 * <p>Fields for which no value was ever sampled (for example an empty or header-only file)
 * keep their current definition and are still listed in the report.
 *
 * @param monitor progress monitor used for task/sub-task reporting and cancellation checks
 * @return the report text describing the detected layout of every input field
 * @throws KettleException declared for the Kettle calls made while reading/converting lines
 */
private String doScan(IProgressMonitor monitor) throws KettleException {
    if (samples > 0) {
        monitor.beginTask(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningFile"), samples + 1);
    } else {
        monitor.beginTask(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningFile"), 2);
    }
    long fileLineNumber = 0;
    DecimalFormatSymbols dfs = new DecimalFormatSymbols();
    int nrfields = meta.getInputFields().length;
    RowMetaInterface outputRowMeta = new RowMeta();
    meta.getFields(outputRowMeta, null, null, null, transMeta, null, null);
    // Remove the storage meta-data (don't go for lazy conversion during scan)
    for (ValueMetaInterface valueMeta : outputRowMeta.getValueMetaList()) {
        valueMeta.setStorageMetadata(null);
        valueMeta.setStorageType(ValueMetaInterface.STORAGE_TYPE_NORMAL);
    }
    RowMetaInterface convertRowMeta = outputRowMeta.cloneToType(ValueMetaInterface.TYPE_STRING);

    // Per-field statistics that end up in the report.
    // How many null values?
    int[] nrnull = new int[nrfields];
    // min / max string representation of the sampled values
    String[] minstr = new String[nrfields];
    String[] maxstr = new String[nrfields];
    // Date info.
    // NOTE(review): these date tables are only initialised below and never narrowed by the
    // scan in this method, so the date section of the report lists every known date format.
    int dateFormatTotal = Const.getDateFormats().length;
    // How many date formats work?
    int[] dateFormatCount = new int[nrfields];
    // What are the date formats that work?
    boolean[][] dateFormat = new boolean[nrfields][dateFormatTotal];
    // min date value
    Date[][] minDate = new Date[nrfields][dateFormatTotal];
    // max date value
    Date[][] maxDate = new Date[nrfields][dateFormatTotal];

    for (int i = 0; i < nrfields; i++) {
        TextFileInputField field = meta.getInputFields()[i];
        if (log.isDebug()) {
            debug = "init field #" + i;
        }
        if (replaceMeta) {
            // Clear previous info...
            // (field was fetched from the same array slot, so name and type are re-applied as is)
            field.setName(meta.getInputFields()[i].getName());
            field.setType(meta.getInputFields()[i].getType());
            field.setFormat("");
            field.setLength(-1);
            field.setPrecision(-1);
            field.setCurrencySymbol(dfs.getCurrencySymbol());
            field.setDecimalSymbol("" + dfs.getDecimalSeparator());
            field.setGroupSymbol("" + dfs.getGroupingSeparator());
            field.setNullString("-");
            field.setTrimType(ValueMetaInterface.TRIM_TYPE_NONE);
        }
        nrnull[i] = 0;
        minstr[i] = "";
        maxstr[i] = "";
        // Init date guess
        for (int j = 0; j < dateFormatTotal; j++) {
            dateFormat[i][j] = true;
            minDate[i][j] = Const.MAX_DATE;
            maxDate[i][j] = Const.MIN_DATE;
        }
        dateFormatCount[i] = dateFormatTotal;
    }

    // Work on a clone in which every field is a String, so that the line conversion below
    // hands back the raw text for the evaluators.
    InputFileMetaInterface strinfo = (InputFileMetaInterface) meta.clone();
    for (int i = 0; i < nrfields; i++) {
        strinfo.getInputFields()[i].setType(ValueMetaInterface.TYPE_STRING);
    }

    // Sample <samples> rows...
    debug = "get first line";
    StringBuilder lineBuffer = new StringBuilder(256);
    int fileFormatType = meta.getFileFormatTypeNr();
    // If the file has a header we overwrite the first line
    // However, if it doesn't have a header, take a new line
    //
    String line = TextFileInput.getLine(log, reader, encodingType, fileFormatType, lineBuffer);
    fileLineNumber++;
    int skipped = 1;
    if (meta.hasHeader()) {
        while (line != null && skipped < meta.getNrHeaderLines()) {
            line = TextFileInput.getLine(log, reader, encodingType, fileFormatType, lineBuffer);
            skipped++;
            fileLineNumber++;
        }
    }

    int linenr = 1;
    // One evaluator per field, created lazily the first time a value for that field shows up.
    List<StringEvaluator> evaluators = new ArrayList<StringEvaluator>();
    // Allocate number and date parsers
    DecimalFormat df2 = (DecimalFormat) NumberFormat.getInstance();
    DecimalFormatSymbols dfs2 = new DecimalFormatSymbols();
    SimpleDateFormat daf2 = new SimpleDateFormat();
    boolean errorFound = false;
    while (!errorFound && line != null && (linenr <= samples || samples == 0) && !monitor.isCanceled()) {
        monitor.subTask(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningLine", "" + linenr));
        if (samples > 0) {
            monitor.worked(1);
        }
        if (log.isDebug()) {
            debug = "convert line #" + linenr + " to row";
        }
        RowMetaInterface rowMeta = new RowMeta();
        meta.getFields(rowMeta, "stepname", null, null, transMeta, null, null);
        // Remove the storage meta-data (don't go for lazy conversion during scan)
        for (ValueMetaInterface valueMeta : rowMeta.getValueMetaList()) {
            valueMeta.setStorageMetadata(null);
            valueMeta.setStorageType(ValueMetaInterface.STORAGE_TYPE_NORMAL);
        }
        String delimiter = transMeta.environmentSubstitute(meta.getSeparator());
        String enclosure = transMeta.environmentSubstitute(meta.getEnclosure());
        String escapeCharacter = transMeta.environmentSubstitute(meta.getEscapeCharacter());
        Object[] r = TextFileInput.convertLineToRow(log, new TextFileLine(line, fileLineNumber, null), strinfo, null, 0, outputRowMeta, convertRowMeta, meta.getFilePaths(transMeta)[0], rownumber, delimiter, enclosure, escapeCharacter, null, false, false, false, false, false, false, false, false, null, null, false, null, null, null, null, 0);
        if (r == null) {
            // Conversion failed: the loop condition ends the scan on the next check.
            errorFound = true;
            continue;
        }
        rownumber++;
        for (int i = 0; i < nrfields && i < r.length; i++) {
            StringEvaluator evaluator;
            if (i >= evaluators.size()) {
                evaluator = new StringEvaluator(true);
                evaluators.add(evaluator);
            } else {
                evaluator = evaluators.get(i);
            }
            evaluator.evaluateString(rowMeta.getString(r, i));
        }
        fileLineNumber++;
        linenr++;
        // Grab another line...
        //
        line = TextFileInput.getLine(log, reader, encodingType, fileFormatType, lineBuffer);
    }
    monitor.worked(1);
    monitor.setTaskName(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.AnalyzingResults"));

    // Show information on items using a dialog box
    //
    StringBuilder message = new StringBuilder();
    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.ResultAfterScanning", "" + (linenr - 1)));
    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.HorizontalLine"));
    for (int i = 0; i < nrfields; i++) {
        TextFileInputField field = meta.getInputFields()[i];
        // No evaluator exists for a field that never received a value (no data lines, or
        // only rows shorter than the field list); leave such a field untouched instead of
        // failing on an out-of-range list access.
        StringEvaluator evaluator = i < evaluators.size() ? evaluators.get(i) : null;
        List<StringEvaluationResult> evaluationResults;
        StringEvaluationResult result;
        if (evaluator == null) {
            evaluationResults = new ArrayList<StringEvaluationResult>();
            result = null;
        } else {
            evaluationResults = evaluator.getStringEvaluationResults();
            result = evaluator.getAdvicedResult();
            // If we didn't find any matching result, it's a String...
            //
            if (evaluationResults.isEmpty()) {
                field.setType(ValueMetaInterface.TYPE_STRING);
                field.setLength(evaluator.getMaxLength());
            }
        }
        if (result != null) {
            // Take the first option we find, list the others below...
            //
            ValueMetaInterface conversionMeta = result.getConversionMeta();
            field.setType(conversionMeta.getType());
            field.setTrimType(conversionMeta.getTrimType());
            field.setFormat(conversionMeta.getConversionMask());
            field.setDecimalSymbol(conversionMeta.getDecimalSymbol());
            field.setGroupSymbol(conversionMeta.getGroupingSymbol());
            field.setLength(conversionMeta.getLength());
            field.setPrecision(conversionMeta.getPrecision());
            nrnull[i] = result.getNrNull();
            minstr[i] = result.getMin() == null ? "" : result.getMin().toString();
            maxstr[i] = result.getMax() == null ? "" : result.getMax().toString();
        }
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.FieldNumber", "" + (i + 1)));
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.FieldName", field.getName()));
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.FieldType", field.getTypeDesc()));
        switch(field.getType()) {
            case ValueMetaInterface.TYPE_NUMBER:
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.EstimatedLength", (field.getLength() < 0 ? "-" : "" + field.getLength())));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.EstimatedPrecision", field.getPrecision() < 0 ? "-" : "" + field.getPrecision()));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberFormat", field.getFormat()));
                if (!evaluationResults.isEmpty()) {
                    if (evaluationResults.size() > 1) {
                        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.WarnNumberFormat"));
                    }
                    for (StringEvaluationResult seResult : evaluationResults) {
                        String mask = seResult.getConversionMeta().getConversionMask();
                        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberFormat2", mask));
                        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.TrimType", seResult.getConversionMeta().getTrimType()));
                        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberMinValue", seResult.getMin()));
                        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberMaxValue", seResult.getMax()));
                        // Best effort: show how the minimum value parses with this mask; a
                        // failure only drops the example line from the report.
                        try {
                            df2.applyPattern(mask);
                            df2.setDecimalFormatSymbols(dfs2);
                            double mn = df2.parse(seResult.getMin().toString()).doubleValue();
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberExample", mask, seResult.getMin(), Double.toString(mn)));
                        } catch (Exception e) {
                            if (log.isDetailed()) {
                                log.logDetailed("This is unexpected: parsing [" + seResult.getMin() + "] with format [" + mask + "] did not work.");
                            }
                        }
                    }
                }
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberNrNullValues", "" + nrnull[i]));
                break;
            case ValueMetaInterface.TYPE_STRING:
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringMaxLength", "" + field.getLength()));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringMinValue", minstr[i]));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringMaxValue", maxstr[i]));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringNrNullValues", "" + nrnull[i]));
                break;
            case ValueMetaInterface.TYPE_DATE:
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateMaxLength", field.getLength() < 0 ? "-" : "" + field.getLength()));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateFormat", field.getFormat()));
                if (dateFormatCount[i] > 1) {
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.WarnDateFormat"));
                }
                if (!Utils.isEmpty(minstr[i])) {
                    for (int x = 0; x < Const.getDateFormats().length; x++) {
                        if (dateFormat[i][x]) {
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateFormat2", Const.getDateFormats()[x]));
                            Date mindate = minDate[i][x];
                            Date maxdate = maxDate[i][x];
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateMinValue", mindate.toString()));
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateMaxValue", maxdate.toString()));
                            daf2.applyPattern(Const.getDateFormats()[x]);
                            // Best effort example, same as for numbers above.
                            try {
                                Date md = daf2.parse(minstr[i]);
                                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateExample", Const.getDateFormats()[x], minstr[i], md.toString()));
                            } catch (Exception e) {
                                if (log.isDetailed()) {
                                    log.logDetailed("This is unexpected: parsing [" + minstr[i] + "] with format [" + Const.getDateFormats()[x] + "] did not work.");
                                }
                            }
                        }
                    }
                }
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateNrNullValues", "" + nrnull[i]));
                break;
            default:
                break;
        }
        // linenr started at 1, so linenr - 1 is the number of lines actually scanned.
        if (nrnull[i] == linenr - 1) {
            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.AllNullValues"));
        }
        message.append(Const.CR);
    }
    monitor.worked(1);
    monitor.done();
    return message.toString();
}
Also used : RowMeta(org.pentaho.di.core.row.RowMeta) DecimalFormat(java.text.DecimalFormat) TextFileInputField(org.pentaho.di.trans.steps.textfileinput.TextFileInputField) ArrayList(java.util.ArrayList) RowMetaInterface(org.pentaho.di.core.row.RowMetaInterface) TextFileLine(org.pentaho.di.trans.steps.textfileinput.TextFileLine) StringEvaluationResult(org.pentaho.di.core.util.StringEvaluationResult) InputFileMetaInterface(org.pentaho.di.trans.steps.textfileinput.InputFileMetaInterface) DecimalFormatSymbols(java.text.DecimalFormatSymbols) Date(java.util.Date) KettleException(org.pentaho.di.core.exception.KettleException) InvocationTargetException(java.lang.reflect.InvocationTargetException) ValueMetaInterface(org.pentaho.di.core.row.ValueMetaInterface) StringEvaluator(org.pentaho.di.core.util.StringEvaluator) SimpleDateFormat(java.text.SimpleDateFormat)

Aggregations

InvocationTargetException (java.lang.reflect.InvocationTargetException)1 DecimalFormat (java.text.DecimalFormat)1 DecimalFormatSymbols (java.text.DecimalFormatSymbols)1 SimpleDateFormat (java.text.SimpleDateFormat)1 ArrayList (java.util.ArrayList)1 Date (java.util.Date)1 KettleException (org.pentaho.di.core.exception.KettleException)1 RowMeta (org.pentaho.di.core.row.RowMeta)1 RowMetaInterface (org.pentaho.di.core.row.RowMetaInterface)1 ValueMetaInterface (org.pentaho.di.core.row.ValueMetaInterface)1 StringEvaluationResult (org.pentaho.di.core.util.StringEvaluationResult)1 StringEvaluator (org.pentaho.di.core.util.StringEvaluator)1 InputFileMetaInterface (org.pentaho.di.trans.steps.textfileinput.InputFileMetaInterface)1 TextFileInputField (org.pentaho.di.trans.steps.textfileinput.TextFileInputField)1 TextFileLine (org.pentaho.di.trans.steps.textfileinput.TextFileLine)1