use of org.pentaho.di.trans.steps.file.BaseFileField in project pentaho-kettle by pentaho.
the class TextFileInputUtils method guessStringsFromLine.
/**
 * Splits one line of text into its raw field strings.
 *
 * <p>When {@code inf.content.fileType} equals "CSV" (ignoring case) the line is cut on
 * {@code delimiter}, honouring an optional {@code enclosure} around fields and an optional
 * {@code escapeCharacter} in front of enclosures, delimiters and escapes. For every other file
 * type the line is treated as fixed-width and each field is cut out using the position and
 * length of the corresponding entry in {@code inf.inputFields}.
 *
 * @param space           not referenced anywhere in this method body
 * @param log             log channel; row-level messages are emitted only when
 *                        {@code log.isRowLevel()} is true
 * @param line            the line to split; may be {@code null}
 * @param inf             step metadata supplying the file type and, for fixed-width input,
 *                        the field positions and lengths
 * @param delimiter       field separator used in CSV mode
 * @param enclosure       optional field enclosure; {@code null} is treated as "no enclosure"
 * @param escapeCharacter optional escape string; {@code null} is treated as "no escape"
 * @return the field strings in the order they were found, or {@code null} when
 *         {@code line} is {@code null}
 * @throws KettleException wrapping any exception raised while splitting the line
 */
public static final String[] guessStringsFromLine(VariableSpace space, LogChannelInterface log, String line, TextFileInputMeta inf, String delimiter, String enclosure, String escapeCharacter) throws KettleException {
List<String> strings = new ArrayList<>();
// piece of line: the text of the field currently being extracted
String pol;
try {
if (line == null) {
return null;
}
if (inf.content.fileType.equalsIgnoreCase("CSV")) {
// Split string in pieces, only for CSV!
// pos is the index in the line where the next field starts.
int pos = 0;
int length = line.length();
// Set once a doubled enclosure has been seen.
// NOTE(review): declared outside the per-field loop and never reset, so after the first
// doubled enclosure every later field of the line also goes through the collapse step below.
boolean dencl = false;
// A null enclosure / escape string is handled as a zero-length one, which disables it.
int len_encl = (enclosure == null ? 0 : enclosure.length());
int len_esc = (escapeCharacter == null ? 0 : escapeCharacter.length());
// One iteration per field.
while (pos < length) {
int from = pos;
// Index of the delimiter that ends this field (or the line length when there is none).
int next;
boolean encl_found;
// Per-field flags that drive the un-escaping replacements after the field is cut out.
boolean contains_escaped_enclosures = false;
boolean contains_escaped_separators = false;
boolean contains_escaped_escape = false;
// "aa;aa";123;"aaa-aaa";000;...
// Does the field start with the enclosure (compared case-insensitively)?
// NOTE(review): only from < length is guaranteed here; if fewer than len_encl characters
// remain, substring throws and the catch below turns it into a KettleException.
if (len_encl > 0 && line.substring(from, from + len_encl).equalsIgnoreCase(enclosure)) {
if (log.isRowLevel()) {
log.logRowlevel(BaseMessages.getString(PKG, "TextFileInput.Log.ConvertLineToRowTitle"), BaseMessages.getString(PKG, "TextFileInput.Log.ConvertLineToRow", line.substring(from, from + len_encl)));
}
encl_found = true;
// p is the scan position, starting right after the opening enclosure.
int p = from + len_encl;
// The strict '<' means an enclosure/escape that ends exactly at the end of the line is not
// flagged here; in that case the scan below stops because p reaches the end of the line.
boolean is_enclosure = len_encl > 0 && p + len_encl < length && line.substring(p, p + len_encl).equalsIgnoreCase(enclosure);
boolean is_escape = len_esc > 0 && p + len_esc < length && line.substring(p, p + len_esc).equalsIgnoreCase(escapeCharacter);
// True when the text at p was found to be the first half of a doubled/escaped enclosure.
boolean enclosure_after = false;
// Is it really an enclosure? See if it's not repeated twice or escaped!
if ((is_enclosure || is_escape) && p < length - 1) {
// Look at the len_encl characters that follow.
// NOTE(review): the look-ahead offset is len_encl even when the match at p was the escape
// string, which is only equivalent when enclosure and escape have the same length.
String strnext = line.substring(p + len_encl, p + 2 * len_encl);
if (strnext.equalsIgnoreCase(enclosure)) {
// Step over one character so the pair is not taken as the closing enclosure.
p++;
enclosure_after = true;
dencl = true;
// Remember to replace them later on!
if (is_escape) {
contains_escaped_enclosures = true;
}
} else if (strnext.equals(escapeCharacter)) {
p++;
// Remember to replace them later on!
if (is_escape) {
// remember
contains_escaped_escape = true;
}
}
}
// Look for a closing enclosure!
// Keep scanning while the current position is not a closing enclosure (or was just found
// to be part of a doubled/escaped pair) and the end of the line has not been reached.
while ((!is_enclosure || enclosure_after) && p < line.length()) {
p++;
enclosure_after = false;
// Same tests as above, but case-sensitive (equals instead of equalsIgnoreCase).
is_enclosure = len_encl > 0 && p + len_encl < length && line.substring(p, p + len_encl).equals(enclosure);
is_escape = len_esc > 0 && p + len_esc < length && line.substring(p, p + len_esc).equals(escapeCharacter);
// Is it really an enclosure? See if it's not repeated twice or escaped!
if ((is_enclosure || is_escape) && p < length - 1) {
String strnext = line.substring(p + len_encl, p + 2 * len_encl);
if (strnext.equals(enclosure)) {
p++;
enclosure_after = true;
dencl = true;
// Remember to replace them later on!
if (is_escape) {
// remember
contains_escaped_enclosures = true;
}
} else if (strnext.equals(escapeCharacter)) {
p++;
// Remember to replace them later on!
if (is_escape) {
// remember
contains_escaped_escape = true;
}
}
}
}
// p is now either at the closing enclosure or at/after the end of the line.
if (p >= length) {
// Ran off the end of the line: the field extends to the end.
next = p;
} else {
// The delimiter is expected right behind the closing enclosure.
next = p + len_encl;
}
if (log.isRowLevel()) {
log.logRowlevel(BaseMessages.getString(PKG, "TextFileInput.Log.ConvertLineToRowTitle"), BaseMessages.getString(PKG, "TextFileInput.Log.EndOfEnclosure", "" + p));
}
} else {
// Field without enclosure: search for the next delimiter that is not escaped.
encl_found = false;
boolean found = false;
int startpoint = from;
// int tries = 1;
do {
next = line.indexOf(delimiter, startpoint);
// See if this position is preceded by an escape character.
// NOTE(review): the strict '> 0' skips the escape check when the delimiter sits at index
// len_esc (escape at the very start of the line); it also covers next == -1.
if (len_esc > 0 && next - len_esc > 0) {
String before = line.substring(next - len_esc, next);
if (escapeCharacter.equals(before)) {
// take the next separator, this one is escaped...
// The search resumes one character further, not delimiter.length() further.
startpoint = next + 1;
// tries++;
contains_escaped_separators = true;
} else {
found = true;
}
} else {
found = true;
}
} while (!found && next >= 0);
}
// No (further) delimiter: the field runs to the end of the line.
if (next == -1) {
next = length;
}
if (encl_found) {
// Cut len_encl characters off both ends of the [from, next) range.
pol = line.substring(from + len_encl, next - len_encl);
if (log.isRowLevel()) {
log.logRowlevel(BaseMessages.getString(PKG, "TextFileInput.Log.ConvertLineToRowTitle"), BaseMessages.getString(PKG, "TextFileInput.Log.EnclosureFieldFound", "" + pol));
}
} else {
pol = line.substring(from, next);
if (log.isRowLevel()) {
log.logRowlevel(BaseMessages.getString(PKG, "TextFileInput.Log.ConvertLineToRowTitle"), BaseMessages.getString(PKG, "TextFileInput.Log.NormalFieldFound", "" + pol));
}
}
// Collapse doubled enclosures: delete one enclosure from each doubled occurrence.
// The search restarts from the beginning of the buffer after every deletion, so it
// repeats until no doubled enclosure is left.
if (dencl) {
StringBuilder sbpol = new StringBuilder(pol);
int idx = sbpol.indexOf(enclosure + enclosure);
while (idx >= 0) {
sbpol.delete(idx, idx + enclosure.length());
idx = sbpol.indexOf(enclosure + enclosure);
}
pol = sbpol.toString();
}
// replace the escaped enclosures with enclosures...
if (contains_escaped_enclosures) {
String replace = escapeCharacter + enclosure;
String replaceWith = enclosure;
pol = Const.replace(pol, replace, replaceWith);
}
// replace the escaped separators with separators...
if (contains_escaped_separators) {
String replace = escapeCharacter + delimiter;
String replaceWith = delimiter;
pol = Const.replace(pol, replace, replaceWith);
}
// replace the escaped escape with escape...
if (contains_escaped_escape) {
String replace = escapeCharacter + escapeCharacter;
String replaceWith = escapeCharacter;
pol = Const.replace(pol, replace, replaceWith);
}
// Now add pol to the strings found!
strings.add(pol);
// The next field starts right after the delimiter.
pos = next + delimiter.length();
}
// pos lands exactly on the line length when the line is empty or ends with a delimiter
// (a last field without trailing delimiter leaves pos beyond the length); in that case
// one more, empty, field is appended.
if (pos == length) {
if (log.isRowLevel()) {
log.logRowlevel(BaseMessages.getString(PKG, "TextFileInput.Log.ConvertLineToRowTitle"), BaseMessages.getString(PKG, "TextFileInput.Log.EndOfEmptyLineFound"));
}
strings.add("");
}
} else {
// Fixed file format: Simply get the strings at the required positions...
for (int i = 0; i < inf.inputFields.length; i++) {
BaseFileField field = inf.inputFields[i];
int length = line.length();
if (field.getPosition() + field.getLength() <= length) {
// The whole field fits inside the line.
strings.add(line.substring(field.getPosition(), field.getPosition() + field.getLength()));
} else {
if (field.getPosition() < length) {
// The line ends inside the field: take whatever is left.
strings.add(line.substring(field.getPosition()));
} else {
// The field starts at or beyond the end of the line.
strings.add("");
}
}
}
}
} catch (Exception e) {
// Any failure (including index errors from the substring calls above) is reported as a KettleException.
throw new KettleException(BaseMessages.getString(PKG, "TextFileInput.Log.Error.ErrorConvertingLine", e.toString()), e);
}
return strings.toArray(new String[strings.size()]);
}
use of org.pentaho.di.trans.steps.file.BaseFileField in project pentaho-kettle by pentaho.
the class TextFileInputContentParsingTest method testDefaultOptions.
/**
 * Reads "default.csv" with three fields declared with position and length -1 and
 * expects the three rows listed below.
 */
@Test
public void testDefaultOptions() throws Exception {
    initByFile("default.csv");

    // NOTE(review): the third field reuses the name "f2" - possibly meant to be "f3"; confirm.
    BaseFileField[] columns = {
        new BaseFileField("f1", -1, -1),
        new BaseFileField("f2", -1, -1),
        new BaseFileField("f2", -1, -1)
    };
    setFields(columns);

    process();

    Object[][] expectedRows = {
        { "first", "1", "1.1" },
        { "second", "2", "2.2" },
        { "third", "3", "3.3" }
    };
    check(expectedRows);
}
use of org.pentaho.di.trans.steps.file.BaseFileField in project pentaho-kettle by pentaho.
the class TextFileInputContentParsingTest method testFixedWidthBytes.
/**
 * Reads "test-fixed-length-bytes.txt" as a headerless "Fixed" file in "Unix" format with
 * "Shift_JIS" encoding and the length setting "Bytes", using four fields at positions
 * 0, 5, 8 and 9, and expects the two rows listed below.
 */
@Test
public void testFixedWidthBytes() throws Exception {
    // Content settings are applied before the file is opened.
    meta.content.header = false;
    meta.content.fileType = "Fixed";
    meta.content.fileFormat = "Unix";
    meta.content.encoding = "Shift_JIS";
    meta.content.length = "Bytes";
    initByFile("test-fixed-length-bytes.txt");

    // Each field is given as (name, position, length).
    BaseFileField[] columns = {
        new BaseFileField("f1", 0, 5),
        new BaseFileField("f2", 5, 3),
        new BaseFileField("f3", 8, 1),
        new BaseFileField("f4", 9, 3)
    };
    setFields(columns);

    process();

    Object[][] expectedRows = {
        { "1.000", "個 ", "T", "1.0" },
        { "2.000", "M ", "Z", "1.0" }
    };
    check(expectedRows);
}
use of org.pentaho.di.trans.steps.file.BaseFileField in project pentaho-kettle by pentaho.
the class TextFileInputContentParsingTest method testFilterVariables.
/**
 * Reads "default.csv" with a filter whose text is the variable reference "${VAR_TEST}"
 * (set to "second") and expects the output to contain only the "first" and "third" rows.
 */
@Test
public void testFilterVariables() throws Exception {
    initByFile("default.csv");

    Variables variables = new Variables();
    variables.setVariable("VAR_TEST", "second");

    // Single filter whose text is resolved through the variables above.
    TextFileFilter variableFilter = new TextFileFilter(0, "${VAR_TEST}", false, false);
    TextFileFilter[] filters = { variableFilter };
    data.filterProcessor = new TextFileFilterProcessor(filters, variables);

    // NOTE(review): the third field reuses the name "f2" - possibly meant to be "f3"; confirm.
    BaseFileField[] columns = {
        new BaseFileField("f1", -1, -1),
        new BaseFileField("f2", -1, -1),
        new BaseFileField("f2", -1, -1)
    };
    setFields(columns);

    process();

    Object[][] expectedRows = {
        { "first", "1", "1.1" },
        { "third", "3", "3.3" }
    };
    check(expectedRows);
}
use of org.pentaho.di.trans.steps.file.BaseFileField in project pentaho-kettle by pentaho.
the class TextFileInputContentParsingTest method testSeparator.
/**
 * Reads "separator.csv" with the separator set to "," and expects three rows; the first
 * value of the last row, "third;third", keeps its semicolon.
 */
@Test
public void testSeparator() throws Exception {
    // The separator is configured before the file is opened.
    meta.content.separator = ",";
    initByFile("separator.csv");

    // NOTE(review): the third field reuses the name "f2" - possibly meant to be "f3"; confirm.
    BaseFileField[] columns = {
        new BaseFileField("f1", -1, -1),
        new BaseFileField("f2", -1, -1),
        new BaseFileField("f2", -1, -1)
    };
    setFields(columns);

    process();

    Object[][] expectedRows = {
        { "first", "1", "1.1" },
        { "second", "2", "2.2" },
        { "third;third", "3", "3.3" }
    };
    check(expectedRows);
}
Aggregations