Search in sources :

Example 1 with IInputFileMeta

use of org.apache.hop.core.file.IInputFileMeta in project hop by apache.

In the class TextFileCSVImportProgressDialog, the method doScan.

/**
 * Samples lines from the already opened text file, guesses a data type and format for every
 * configured input field, writes the guess back into the field metadata and returns a
 * human-readable report of what was found.
 *
 * <p>The method reads from the enclosing dialog's state ({@code meta}, {@code reader},
 * {@code samples}, {@code replaceMeta}, {@code variables}, ...). When {@code samples} is 0 the
 * scan continues until the end of the input; otherwise at most {@code samples} lines are
 * evaluated. The scan also stops when the monitor is cancelled or when a line cannot be
 * converted to a row.
 *
 * <p>Fields for which no value was sampled (for example because the file contains no data lines)
 * keep their current metadata and are still listed in the report.
 *
 * @param monitor progress monitor that receives task/sub-task updates and is polled for
 *     cancellation
 * @param failOnParseError passed through to the line-to-row conversion and to the string
 *     extraction of each sampled value
 * @return the scan report, one section per input field
 * @throws HopException declared for the metadata, line-reading and conversion calls made here
 */
private String doScan(IProgressMonitor monitor, final boolean failOnParseError) throws HopException {
    if (samples > 0) {
        monitor.beginTask(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningFile"), samples + 1);
    } else {
        monitor.beginTask(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningFile"), 2);
    }
    long fileLineNumber = 0;
    DecimalFormatSymbols dfs = new DecimalFormatSymbols();
    int nrFields = meta.getInputFields().length;
    int nrDateFormats = Const.getDateFormats().length;
    IRowMeta outputRowMeta = new RowMeta();
    meta.getFields(outputRowMeta, null, null, null, variables, null);
    // Remove the storage meta-data (don't go for lazy conversion during scan)
    for (IValueMeta valueMeta : outputRowMeta.getValueMetaList()) {
        valueMeta.setStorageMetadata(null);
        valueMeta.setStorageType(IValueMeta.STORAGE_TYPE_NORMAL);
    }
    IRowMeta convertRowMeta = outputRowMeta.cloneToType(IValueMeta.TYPE_STRING);
    // Per-field statistics that end up in the report.
    // How many null values were seen?
    int[] nrnull = new int[nrFields];
    // Smallest / largest value seen, as text
    String[] minstr = new String[nrFields];
    String[] maxstr = new String[nrFields];
    // Date info: which of the known date formats are still candidates, and how many
    int[] dateFormatCount = new int[nrFields];
    boolean[][] dateFormat = new boolean[nrFields][nrDateFormats];
    // min / max date value per candidate format
    Date[][] minDate = new Date[nrFields][nrDateFormats];
    Date[][] maxDate = new Date[nrFields][nrDateFormats];
    for (int i = 0; i < nrFields; i++) {
        TextFileInputField field = meta.getInputFields()[i];
        if (log.isDebug()) {
            debug = "init field #" + i;
        }
        if (replaceMeta) {
            // Clear previous info...
            field.setName(meta.getInputFields()[i].getName());
            field.setType(meta.getInputFields()[i].getType());
            field.setFormat("");
            field.setLength(-1);
            field.setPrecision(-1);
            field.setCurrencySymbol(dfs.getCurrencySymbol());
            field.setDecimalSymbol("" + dfs.getDecimalSeparator());
            field.setGroupSymbol("" + dfs.getGroupingSeparator());
            field.setNullString("-");
            field.setTrimType(IValueMeta.TRIM_TYPE_NONE);
        }
        nrnull[i] = 0;
        minstr[i] = "";
        maxstr[i] = "";
        // Init date guess: every known format starts out as a candidate
        for (int j = 0; j < nrDateFormats; j++) {
            dateFormat[i][j] = true;
            minDate[i][j] = Const.MAX_DATE;
            maxDate[i][j] = Const.MIN_DATE;
        }
        dateFormatCount[i] = nrDateFormats;
    }
    // Work on a copy of the metadata in which every field is read as a plain String,
    // so the raw text reaches the evaluators unconverted.
    IInputFileMeta strinfo = (IInputFileMeta) meta.clone();
    for (int i = 0; i < nrFields; i++) {
        strinfo.getInputFields()[i].setType(IValueMeta.TYPE_STRING);
    }
    // Sample <samples> rows...
    debug = "get first line";
    StringBuilder lineBuffer = new StringBuilder(256);
    int fileFormatType = meta.getFileFormatTypeNr();
    // If the file has a header we overwrite the first line
    // However, if it doesn't have a header, take a new line
    //
    String line = TextFileInput.getLine(log, reader, encodingType, fileFormatType, lineBuffer);
    fileLineNumber++;
    if (meta.hasHeader()) {
        int skipped = 0;
        while (line != null && skipped < meta.getNrHeaderLines()) {
            line = TextFileInput.getLine(log, reader, encodingType, fileFormatType, lineBuffer);
            skipped++;
            fileLineNumber++;
        }
    }
    int linenr = 1;
    // One evaluator per field, created lazily when the first value for that field is seen
    List<StringEvaluator> evaluators = new ArrayList<>();
    // Allocate number and date parsers
    DecimalFormat df2 = (DecimalFormat) NumberFormat.getInstance();
    DecimalFormatSymbols dfs2 = new DecimalFormatSymbols();
    SimpleDateFormat daf2 = new SimpleDateFormat();
    while (line != null && (linenr <= samples || samples == 0) && !monitor.isCanceled()) {
        monitor.subTask(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningLine", "" + linenr));
        if (samples > 0) {
            monitor.worked(1);
        }
        if (log.isDebug()) {
            debug = "convert line #" + linenr + " to row";
        }
        IRowMeta rowMeta = new RowMeta();
        meta.getFields(rowMeta, "transformName", null, null, variables, null);
        // Remove the storage meta-data (don't go for lazy conversion during scan)
        for (IValueMeta valueMeta : rowMeta.getValueMetaList()) {
            valueMeta.setStorageMetadata(null);
            valueMeta.setStorageType(IValueMeta.STORAGE_TYPE_NORMAL);
        }
        String delimiter = variables.resolve(meta.getSeparator());
        String enclosure = variables.resolve(meta.getEnclosure());
        String escapeCharacter = variables.resolve(meta.getEscapeCharacter());
        Object[] r = TextFileInput.convertLineToRow(log, new TextFileLine(line, fileLineNumber, null), strinfo, null, 0, outputRowMeta, convertRowMeta, meta.getFilePaths(variables)[0], rownumber, delimiter, enclosure, escapeCharacter, null, false, false, false, false, false, false, false, false, null, null, false, null, null, null, null, 0, failOnParseError);
        if (r == null) {
            // The line could not be converted: stop sampling and report what we have so far.
            break;
        }
        rownumber++;
        for (int i = 0; i < nrFields && i < r.length; i++) {
            StringEvaluator evaluator;
            if (i >= evaluators.size()) {
                evaluator = new StringEvaluator(true);
                evaluators.add(evaluator);
            } else {
                evaluator = evaluators.get(i);
            }
            String string = getStringFromRow(rowMeta, r, i, failOnParseError);
            evaluator.evaluateString(string);
        }
        fileLineNumber++;
        linenr++;
        // Grab another line...
        //
        line = TextFileInput.getLine(log, reader, encodingType, fileFormatType, lineBuffer);
    }
    monitor.worked(1);
    monitor.setTaskName(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.AnalyzingResults"));
    // Show information on items using a dialog box
    //
    StringBuilder message = new StringBuilder();
    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.ResultAfterScanning", "" + (linenr - 1)));
    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.HorizontalLine"));
    for (int i = 0; i < nrFields; i++) {
        TextFileInputField field = meta.getInputFields()[i];
        List<StringEvaluationResult> evaluationResults = new ArrayList<>();
        StringEvaluationResult result = null;
        // A field only has an evaluator when at least one value was sampled for it. Without this
        // guard an empty file (or an immediately cancelled scan) fails with an index error.
        if (i < evaluators.size()) {
            StringEvaluator evaluator = evaluators.get(i);
            evaluationResults = evaluator.getStringEvaluationResults();
            result = evaluator.getAdvicedResult();
            // If we didn't find any matching result, it's a String...
            //
            if (evaluationResults.isEmpty()) {
                field.setType(IValueMeta.TYPE_STRING);
                field.setLength(evaluator.getMaxLength());
            }
        }
        if (result != null) {
            // Take the first option we find, list the others below...
            //
            IValueMeta conversionMeta = result.getConversionMeta();
            field.setType(conversionMeta.getType());
            field.setTrimType(conversionMeta.getTrimType());
            field.setFormat(conversionMeta.getConversionMask());
            field.setDecimalSymbol(conversionMeta.getDecimalSymbol());
            field.setGroupSymbol(conversionMeta.getGroupingSymbol());
            field.setLength(conversionMeta.getLength());
            field.setPrecision(conversionMeta.getPrecision());
            nrnull[i] = result.getNrNull();
            minstr[i] = result.getMin() == null ? "" : result.getMin().toString();
            maxstr[i] = result.getMax() == null ? "" : result.getMax().toString();
        }
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.FieldNumber", "" + (i + 1)));
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.FieldName", field.getName()));
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.FieldType", field.getTypeDesc()));
        switch(field.getType()) {
            case IValueMeta.TYPE_NUMBER:
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.EstimatedLength", (field.getLength() < 0 ? "-" : "" + field.getLength())));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.EstimatedPrecision", field.getPrecision() < 0 ? "-" : "" + field.getPrecision()));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberFormat", field.getFormat()));
                if (!evaluationResults.isEmpty()) {
                    if (evaluationResults.size() > 1) {
                        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.WarnNumberFormat"));
                    }
                    for (StringEvaluationResult seResult : evaluationResults) {
                        String mask = seResult.getConversionMeta().getConversionMask();
                        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberFormat2", mask));
                        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.TrimType", seResult.getConversionMeta().getTrimType()));
                        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberMinValue", seResult.getMin()));
                        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberMaxValue", seResult.getMax()));
                        // Best effort: show how the minimum parses with this mask; a failure only
                        // costs the example line in the report.
                        try {
                            df2.applyPattern(mask);
                            df2.setDecimalFormatSymbols(dfs2);
                            double mn = df2.parse(seResult.getMin().toString()).doubleValue();
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberExample", mask, seResult.getMin(), Double.toString(mn)));
                        } catch (Exception e) {
                            if (log.isDetailed()) {
                                log.logDetailed("This is unexpected: parsing [" + seResult.getMin() + "] with format [" + mask + "] did not work.");
                            }
                        }
                    }
                }
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberNrNullValues", "" + nrnull[i]));
                break;
            case IValueMeta.TYPE_STRING:
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringMaxLength", "" + field.getLength()));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringMinValue", minstr[i]));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringMaxValue", maxstr[i]));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringNrNullValues", "" + nrnull[i]));
                break;
            case IValueMeta.TYPE_DATE:
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateMaxLength", field.getLength() < 0 ? "-" : "" + field.getLength()));
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateFormat", field.getFormat()));
                if (dateFormatCount[i] > 1) {
                    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.WarnDateFormat"));
                }
                if (!Utils.isEmpty(minstr[i])) {
                    for (int x = 0; x < Const.getDateFormats().length; x++) {
                        if (dateFormat[i][x]) {
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateFormat2", Const.getDateFormats()[x]));
                            Date mindate = minDate[i][x];
                            Date maxdate = maxDate[i][x];
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateMinValue", mindate.toString()));
                            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateMaxValue", maxdate.toString()));
                            daf2.applyPattern(Const.getDateFormats()[x]);
                            try {
                                Date md = daf2.parse(minstr[i]);
                                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateExample", Const.getDateFormats()[x], minstr[i], md.toString()));
                            } catch (Exception e) {
                                if (log.isDetailed()) {
                                    log.logDetailed("This is unexpected: parsing [" + minstr[i] + "] with format [" + Const.getDateFormats()[x] + "] did not work.");
                                }
                            }
                        }
                    }
                }
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateNrNullValues", "" + nrnull[i]));
                break;
            default:
                break;
        }
        if (nrnull[i] == linenr - 1) {
            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.AllNullValues"));
        }
        message.append(Const.CR);
    }
    monitor.worked(1);
    monitor.done();
    return message.toString();
}
Also used : RowMeta(org.apache.hop.core.row.RowMeta) IRowMeta(org.apache.hop.core.row.IRowMeta) DecimalFormat(java.text.DecimalFormat) TextFileInputField(org.apache.hop.core.file.TextFileInputField) ArrayList(java.util.ArrayList) IInputFileMeta(org.apache.hop.core.file.IInputFileMeta) StringEvaluationResult(org.apache.hop.core.util.StringEvaluationResult) DecimalFormatSymbols(java.text.DecimalFormatSymbols) IRowMeta(org.apache.hop.core.row.IRowMeta) Date(java.util.Date) HopException(org.apache.hop.core.exception.HopException) InvocationTargetException(java.lang.reflect.InvocationTargetException) IValueMeta(org.apache.hop.core.row.IValueMeta) StringEvaluator(org.apache.hop.core.util.StringEvaluator) SimpleDateFormat(java.text.SimpleDateFormat)

Example 2 with IInputFileMeta

use of org.apache.hop.core.file.IInputFileMeta in project hop by apache.

In the class TextFileInputTest, the method convertLineToRowTest.

/**
 * Checks that {@code TextFileInput.convertLineToRow} does not fail with a
 * {@link NullPointerException} when every value conversion throws a {@code HopValueException}
 * while the file metadata reports that errors are ignored and erroneous lines are skipped.
 *
 * @throws Exception if the conversion under test throws unexpectedly
 */
@Test
public void convertLineToRowTest() throws Exception {
    // A value meta whose string conversion always fails.
    IValueMeta failingValueMeta = Mockito.mock(IValueMeta.class);
    Mockito.doThrow(new HopValueException("Error converting")).when(failingValueMeta).convertDataFromString(Mockito.anyString(), Mockito.any(IValueMeta.class), Mockito.anyString(), Mockito.anyString(), Mockito.anyInt());

    // Output row metadata that hands out the failing value meta for every index.
    IRowMeta rowMetaOut = Mockito.mock(IRowMeta.class);
    Mockito.doReturn(15).when(rowMetaOut).size();
    Mockito.doReturn(failingValueMeta).when(rowMetaOut).getValueMeta(Mockito.anyInt());

    // File metadata: three fields, with error ignoring and error-line skipping switched on.
    TextFileInputField[] inputFields = { new TextFileInputField(), new TextFileInputField(), new TextFileInputField() };
    IInputFileMeta fileMeta = Mockito.mock(IInputFileMeta.class);
    Mockito.doReturn(inputFields).when(fileMeta).getInputFields();
    Mockito.doReturn("CSV").when(fileMeta).getFileType();
    Mockito.doReturn("/").when(fileMeta).getEscapeCharacter();
    Mockito.doReturn(true).when(fileMeta).isErrorIgnored();
    Mockito.doReturn(true).when(fileMeta).isErrorLineSkipped();

    // The line to convert.
    TextFileLine inputLine = Mockito.mock(TextFileLine.class);
    inputLine.line = "testData1;testData2;testData3";

    ILogChannel logChannel = Mockito.mock(ILogChannel.class);

    // it should run without NPE
    TextFileInput.convertLineToRow(logChannel, inputLine, fileMeta, new Object[3], 1, rowMetaOut, Mockito.mock(IRowMeta.class), null, 1L, ";", null, "/", Mockito.mock(IFileErrorHandler.class), false, false, false, false, false, false, false, false, null, null, false, new Date(), null, null, null, 1L);
}
Also used : IFileErrorHandler(org.apache.hop.pipeline.transform.errorhandling.IFileErrorHandler) IValueMeta(org.apache.hop.core.row.IValueMeta) IInputFileMeta(org.apache.hop.core.file.IInputFileMeta) ILogChannel(org.apache.hop.core.logging.ILogChannel) IRowMeta(org.apache.hop.core.row.IRowMeta) TextFileInputField(org.apache.hop.core.file.TextFileInputField) HopValueException(org.apache.hop.core.exception.HopValueException) Date(java.util.Date) Test(org.junit.Test)

Aggregations

Date (java.util.Date)2 IInputFileMeta (org.apache.hop.core.file.IInputFileMeta)2 TextFileInputField (org.apache.hop.core.file.TextFileInputField)2 IRowMeta (org.apache.hop.core.row.IRowMeta)2 IValueMeta (org.apache.hop.core.row.IValueMeta)2 InvocationTargetException (java.lang.reflect.InvocationTargetException)1 DecimalFormat (java.text.DecimalFormat)1 DecimalFormatSymbols (java.text.DecimalFormatSymbols)1 SimpleDateFormat (java.text.SimpleDateFormat)1 ArrayList (java.util.ArrayList)1 HopException (org.apache.hop.core.exception.HopException)1 HopValueException (org.apache.hop.core.exception.HopValueException)1 ILogChannel (org.apache.hop.core.logging.ILogChannel)1 RowMeta (org.apache.hop.core.row.RowMeta)1 StringEvaluationResult (org.apache.hop.core.util.StringEvaluationResult)1 StringEvaluator (org.apache.hop.core.util.StringEvaluator)1 IFileErrorHandler (org.apache.hop.pipeline.transform.errorhandling.IFileErrorHandler)1 Test (org.junit.Test)1