Search in sources :

Example 11 with BaseFileField

use of org.pentaho.di.trans.steps.file.BaseFileField in project pentaho-kettle by pentaho.

the class TextFileInputUtils method guessStringsFromLine.

/**
 * Splits one line of text into its field strings.
 *
 * <p>When {@code inf.content.fileType} equals "CSV" (ignoring case) the line is cut at each
 * {@code delimiter}. A field that starts with {@code enclosure} is read up to its closing
 * enclosure; doubled enclosures and sequences prefixed by {@code escapeCharacter} are detected
 * during the scan and collapsed afterwards. If the scan ends with the position exactly at the
 * end of the line (for example an empty line, or a line ending in the delimiter) one extra
 * empty string is appended.
 *
 * <p>For any other file type the fields are cut out of the line using the position and length
 * of each entry in {@code inf.inputFields}; a field that runs past the end of the line is
 * truncated, and a field that starts past the end yields an empty string.
 *
 * @param space not referenced by this method body
 * @param log channel used for row-level tracing of the split
 * @param line the line to split; may be {@code null}
 * @param inf supplies {@code content.fileType} and {@code inputFields}
 * @param delimiter field separator used for CSV splitting
 * @param enclosure optional enclosure string; {@code null} or empty disables enclosure handling
 * @param escapeCharacter optional escape string; {@code null} or empty disables escape handling
 * @return the fields found, or {@code null} when {@code line} is {@code null}
 * @throws KettleException wrapping any exception raised while splitting the line
 */
public static final String[] guessStringsFromLine(VariableSpace space, LogChannelInterface log, String line, TextFileInputMeta inf, String delimiter, String enclosure, String escapeCharacter) throws KettleException {
    List<String> strings = new ArrayList<>();
    // piece of line
    String pol;
    try {
        if (line == null) {
            return null;
        }
        if (inf.content.fileType.equalsIgnoreCase("CSV")) {
            // Split string in pieces, only for CSV!
            int pos = 0;
            int length = line.length();
            // Set once a doubled enclosure is seen. It is declared outside the field loop and never
            // reset, so NOTE(review): it also triggers the de-doubling step for every later field
            // on the same line - confirm this is intended.
            boolean dencl = false;
            int len_encl = (enclosure == null ? 0 : enclosure.length());
            int len_esc = (escapeCharacter == null ? 0 : escapeCharacter.length());
            // Each iteration extracts one field starting at pos.
            while (pos < length) {
                int from = pos;
                int next;
                boolean encl_found;
                // Per-field flags: which un-escaping replacements must be applied to this field.
                boolean contains_escaped_enclosures = false;
                boolean contains_escaped_separators = false;
                boolean contains_escaped_escape = false;
                // "aa;aa";123;"aaa-aaa";000;...
                // NOTE(review): substring(from, from + len_encl) can exceed the line length when the
                // enclosure is longer than the remaining text; the resulting exception ends up in the
                // catch block below.
                if (len_encl > 0 && line.substring(from, from + len_encl).equalsIgnoreCase(enclosure)) {
                    if (log.isRowLevel()) {
                        log.logRowlevel(BaseMessages.getString(PKG, "TextFileInput.Log.ConvertLineToRowTitle"), BaseMessages.getString(PKG, "TextFileInput.Log.ConvertLineToRow", line.substring(from, from + len_encl)));
                    }
                    encl_found = true;
                    // p walks the line looking for the closing enclosure; start right after the opening one.
                    int p = from + len_encl;
                    boolean is_enclosure = len_encl > 0 && p + len_encl < length && line.substring(p, p + len_encl).equalsIgnoreCase(enclosure);
                    boolean is_escape = len_esc > 0 && p + len_esc < length && line.substring(p, p + len_esc).equalsIgnoreCase(escapeCharacter);
                    boolean enclosure_after = false;
                    // Is it really an enclosure? See if it's not repeated twice or escaped!
                    if ((is_enclosure || is_escape) && p < length - 1) {
                        // NOTE(review): the look-ahead is cut with the enclosure width even when the match
                        // at p was the escape sequence; the two only line up when both have equal length.
                        String strnext = line.substring(p + len_encl, p + 2 * len_encl);
                        if (strnext.equalsIgnoreCase(enclosure)) {
                            p++;
                            enclosure_after = true;
                            dencl = true;
                            // Remember to replace them later on!
                            if (is_escape) {
                                contains_escaped_enclosures = true;
                            }
                        } else if (strnext.equals(escapeCharacter)) {
                            p++;
                            // Remember to replace them later on!
                            if (is_escape) {
                                // remember
                                contains_escaped_escape = true;
                            }
                        }
                    }
                    // Look for a closing enclosure!
                    // Keep advancing while the current position is not an enclosure, or while the enclosure
                    // just seen was immediately followed by another one (enclosure_after).
                    while ((!is_enclosure || enclosure_after) && p < line.length()) {
                        p++;
                        enclosure_after = false;
                        // Unlike the first check above, these comparisons are case-sensitive.
                        is_enclosure = len_encl > 0 && p + len_encl < length && line.substring(p, p + len_encl).equals(enclosure);
                        is_escape = len_esc > 0 && p + len_esc < length && line.substring(p, p + len_esc).equals(escapeCharacter);
                        // Is it really an enclosure? See if it's not repeated twice or escaped!
                        if ((is_enclosure || is_escape) && p < length - 1) {
                            // NOTE(review): same enclosure-width look-ahead as above, also used for escapes.
                            String strnext = line.substring(p + len_encl, p + 2 * len_encl);
                            if (strnext.equals(enclosure)) {
                                p++;
                                enclosure_after = true;
                                dencl = true;
                                // Remember to replace them later on!
                                if (is_escape) {
                                    // remember
                                    contains_escaped_enclosures = true;
                                }
                            } else if (strnext.equals(escapeCharacter)) {
                                p++;
                                // Remember to replace them later on!
                                if (is_escape) {
                                    // remember
                                    contains_escaped_escape = true;
                                }
                            }
                        }
                    }
                    // next is the index just past the closing enclosure, or p itself when the scan ran
                    // off the end of the line.
                    if (p >= length) {
                        next = p;
                    } else {
                        next = p + len_encl;
                    }
                    if (log.isRowLevel()) {
                        log.logRowlevel(BaseMessages.getString(PKG, "TextFileInput.Log.ConvertLineToRowTitle"), BaseMessages.getString(PKG, "TextFileInput.Log.EndOfEnclosure", "" + p));
                    }
                } else {
                    // Field without enclosure: find the next delimiter that is not preceded by the escape.
                    encl_found = false;
                    boolean found = false;
                    int startpoint = from;
                    // int tries = 1;
                    do {
                        next = line.indexOf(delimiter, startpoint);
                        // See if this position is preceded by an escape character.
                        // When no delimiter is left (next == -1) this condition is false and the loop ends.
                        // NOTE(review): the strict "> 0" also skips the escape check for a delimiter located
                        // exactly len_esc characters into the line.
                        if (len_esc > 0 && next - len_esc > 0) {
                            String before = line.substring(next - len_esc, next);
                            if (escapeCharacter.equals(before)) {
                                // take the next separator, this one is escaped...
                                startpoint = next + 1;
                                // tries++;
                                contains_escaped_separators = true;
                            } else {
                                found = true;
                            }
                        } else {
                            found = true;
                        }
                    } while (!found && next >= 0);
                }
                // No further delimiter: the field runs to the end of the line.
                if (next == -1) {
                    next = length;
                }
                if (encl_found) {
                    // Strip the enclosure width from both ends of the enclosed field.
                    pol = line.substring(from + len_encl, next - len_encl);
                    if (log.isRowLevel()) {
                        log.logRowlevel(BaseMessages.getString(PKG, "TextFileInput.Log.ConvertLineToRowTitle"), BaseMessages.getString(PKG, "TextFileInput.Log.EnclosureFieldFound", "" + pol));
                    }
                } else {
                    pol = line.substring(from, next);
                    if (log.isRowLevel()) {
                        log.logRowlevel(BaseMessages.getString(PKG, "TextFileInput.Log.ConvertLineToRowTitle"), BaseMessages.getString(PKG, "TextFileInput.Log.NormalFieldFound", "" + pol));
                    }
                }
                // Collapse each doubled enclosure into a single one by deleting its first copy.
                if (dencl) {
                    StringBuilder sbpol = new StringBuilder(pol);
                    int idx = sbpol.indexOf(enclosure + enclosure);
                    while (idx >= 0) {
                        sbpol.delete(idx, idx + enclosure.length());
                        idx = sbpol.indexOf(enclosure + enclosure);
                    }
                    pol = sbpol.toString();
                }
                // replace the escaped enclosures with enclosures...
                if (contains_escaped_enclosures) {
                    String replace = escapeCharacter + enclosure;
                    String replaceWith = enclosure;
                    pol = Const.replace(pol, replace, replaceWith);
                }
                // replace the escaped separators with separators...
                if (contains_escaped_separators) {
                    String replace = escapeCharacter + delimiter;
                    String replaceWith = delimiter;
                    pol = Const.replace(pol, replace, replaceWith);
                }
                // replace the escaped escape with escape...
                if (contains_escaped_escape) {
                    String replace = escapeCharacter + escapeCharacter;
                    String replaceWith = escapeCharacter;
                    pol = Const.replace(pol, replace, replaceWith);
                }
                // Now add pol to the strings found!
                strings.add(pol);
                // Continue right after the delimiter that ended this field.
                pos = next + delimiter.length();
            }
            // pos lands exactly on the end of the line when the line is empty or the last field was
            // followed by a delimiter; in both cases one empty field is appended.
            if (pos == length) {
                if (log.isRowLevel()) {
                    log.logRowlevel(BaseMessages.getString(PKG, "TextFileInput.Log.ConvertLineToRowTitle"), BaseMessages.getString(PKG, "TextFileInput.Log.EndOfEmptyLineFound"));
                }
                strings.add("");
            }
        } else {
            // Fixed file format: Simply get the strings at the required positions...
            for (int i = 0; i < inf.inputFields.length; i++) {
                BaseFileField field = inf.inputFields[i];
                int length = line.length();
                if (field.getPosition() + field.getLength() <= length) {
                    // The whole field fits inside the line.
                    strings.add(line.substring(field.getPosition(), field.getPosition() + field.getLength()));
                } else {
                    if (field.getPosition() < length) {
                        // The field starts inside the line but runs past its end: take what is there.
                        strings.add(line.substring(field.getPosition()));
                    } else {
                        // The field starts beyond the end of the line.
                        strings.add("");
                    }
                }
            }
        }
    } catch (Exception e) {
        // Any failure (including index errors from the substring calls above) is reported as a KettleException.
        throw new KettleException(BaseMessages.getString(PKG, "TextFileInput.Log.Error.ErrorConvertingLine", e.toString()), e);
    }
    return strings.toArray(new String[strings.size()]);
}
Also used : KettleException(org.pentaho.di.core.exception.KettleException) ArrayList(java.util.ArrayList) BaseFileField(org.pentaho.di.trans.steps.file.BaseFileField) KettleException(org.pentaho.di.core.exception.KettleException) KettleFileException(org.pentaho.di.core.exception.KettleFileException)

Example 12 with BaseFileField

use of org.pentaho.di.trans.steps.file.BaseFileField in project pentaho-kettle by pentaho.

the class TextFileInputContentParsingTest method testDefaultOptions.

/**
 * Runs the parser over "default.csv" without changing any content option and checks the
 * three rows it is expected to produce.
 */
@Test
public void testDefaultOptions() throws Exception {
    initByFile("default.csv");

    // NOTE(review): the third field reuses the name "f2"; kept exactly as in the original test.
    BaseFileField firstField = new BaseFileField("f1", -1, -1);
    BaseFileField secondField = new BaseFileField("f2", -1, -1);
    BaseFileField thirdField = new BaseFileField("f2", -1, -1);
    setFields(firstField, secondField, thirdField);

    process();

    Object[][] expectedRows = {
        { "first", "1", "1.1" },
        { "second", "2", "2.2" },
        { "third", "3", "3.3" }
    };
    check(expectedRows);
}
Also used : BaseFileField(org.pentaho.di.trans.steps.file.BaseFileField) Test(org.junit.Test)

Example 13 with BaseFileField

use of org.pentaho.di.trans.steps.file.BaseFileField in project pentaho-kettle by pentaho.

the class TextFileInputContentParsingTest method testFixedWidthBytes.

/**
 * Configures a headerless "Fixed" file with "Shift_JIS" encoding and length measured in
 * "Bytes", then checks the four fields cut from each row of "test-fixed-length-bytes.txt".
 */
@Test
public void testFixedWidthBytes() throws Exception {
    // Content options are set before the file is initialised, as in the original test.
    meta.content.header = false;
    meta.content.fileType = "Fixed";
    meta.content.fileFormat = "Unix";
    meta.content.encoding = "Shift_JIS";
    meta.content.length = "Bytes";
    initByFile("test-fixed-length-bytes.txt");

    // Each field is declared with its name followed by the two numeric arguments (0/5, 5/3, 8/1, 9/3).
    BaseFileField firstField = new BaseFileField("f1", 0, 5);
    BaseFileField secondField = new BaseFileField("f2", 5, 3);
    BaseFileField thirdField = new BaseFileField("f3", 8, 1);
    BaseFileField fourthField = new BaseFileField("f4", 9, 3);
    setFields(firstField, secondField, thirdField, fourthField);

    process();

    Object[][] expectedRows = {
        { "1.000", "個 ", "T", "1.0" },
        { "2.000", "M  ", "Z", "1.0" }
    };
    check(expectedRows);
}
Also used : BaseFileField(org.pentaho.di.trans.steps.file.BaseFileField) Test(org.junit.Test)

Example 14 with BaseFileField

use of org.pentaho.di.trans.steps.file.BaseFileField in project pentaho-kettle by pentaho.

the class TextFileInputContentParsingTest method testFilterVariables.

/**
 * Installs a filter whose text is the variable expression "${VAR_TEST}" (set to "second")
 * and checks that only the "first" and "third" rows of "default.csv" are reported.
 */
@Test
public void testFilterVariables() throws Exception {
    initByFile("default.csv");

    Variables variables = new Variables();
    variables.setVariable("VAR_TEST", "second");

    // The filter text is a variable reference; the processor receives the variables to resolve it against.
    TextFileFilter variableFilter = new TextFileFilter(0, "${VAR_TEST}", false, false);
    TextFileFilter[] filters = { variableFilter };
    data.filterProcessor = new TextFileFilterProcessor(filters, variables);

    // NOTE(review): the third field reuses the name "f2"; kept exactly as in the original test.
    BaseFileField firstField = new BaseFileField("f1", -1, -1);
    BaseFileField secondField = new BaseFileField("f2", -1, -1);
    BaseFileField thirdField = new BaseFileField("f2", -1, -1);
    setFields(firstField, secondField, thirdField);

    process();

    Object[][] expectedRows = {
        { "first", "1", "1.1" },
        { "third", "3", "3.3" }
    };
    check(expectedRows);
}
Also used : Variables(org.pentaho.di.core.variables.Variables) BaseFileField(org.pentaho.di.trans.steps.file.BaseFileField) Test(org.junit.Test)

Example 15 with BaseFileField

use of org.pentaho.di.trans.steps.file.BaseFileField in project pentaho-kettle by pentaho.

the class TextFileInputContentParsingTest method testSeparator.

/**
 * Sets the separator to "," and checks the rows read from "separator.csv", including a
 * value that itself contains a semicolon ("third;third").
 */
@Test
public void testSeparator() throws Exception {
    meta.content.separator = ",";
    initByFile("separator.csv");

    // NOTE(review): the third field reuses the name "f2"; kept exactly as in the original test.
    BaseFileField firstField = new BaseFileField("f1", -1, -1);
    BaseFileField secondField = new BaseFileField("f2", -1, -1);
    BaseFileField thirdField = new BaseFileField("f2", -1, -1);
    setFields(firstField, secondField, thirdField);

    process();

    Object[][] expectedRows = {
        { "first", "1", "1.1" },
        { "second", "2", "2.2" },
        { "third;third", "3", "3.3" }
    };
    check(expectedRows);
}
Also used : BaseFileField(org.pentaho.di.trans.steps.file.BaseFileField) Test(org.junit.Test)

Aggregations

BaseFileField (org.pentaho.di.trans.steps.file.BaseFileField)36 Test (org.junit.Test)19 ValueMetaString (org.pentaho.di.core.row.value.ValueMetaString)12 KettleException (org.pentaho.di.core.exception.KettleException)9 KettleFileException (org.pentaho.di.core.exception.KettleFileException)7 ValueMetaInterface (org.pentaho.di.core.row.ValueMetaInterface)5 ArrayList (java.util.ArrayList)4 KettleStepException (org.pentaho.di.core.exception.KettleStepException)4 KettleXMLException (org.pentaho.di.core.exception.KettleXMLException)4 TableItem (org.eclipse.swt.widgets.TableItem)3 RowMetaInterface (org.pentaho.di.core.row.RowMetaInterface)3 Shell (org.eclipse.swt.widgets.Shell)2 TextFileInputFieldInterface (org.pentaho.di.core.gui.TextFileInputFieldInterface)2 RowMeta (org.pentaho.di.core.row.RowMeta)2 Variables (org.pentaho.di.core.variables.Variables)2 TextFileInputMeta (org.pentaho.di.trans.steps.fileinput.text.TextFileInputMeta)2 ByteArrayInputStream (java.io.ByteArrayInputStream)1 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 IOException (java.io.IOException)1 OutputStream (java.io.OutputStream)1