use of org.pentaho.di.trans.steps.textfileinput.EncodingType in project pentaho-kettle by pentaho.
the class TextFileInputDialog method getFirst.
// Get the first x lines
/**
 * Reads the first lines of the first file matched by the dialog's current settings.
 *
 * <p>The file is opened through the configured compression provider and decoded with the
 * configured encoding, or with the platform default when no encoding is set.
 *
 * @param nrlines the number of lines requested; {@code 0} means "read until end of file"
 * @param skipHeaders when {@code true}, the document header lines (paged layout only) and the
 *        header lines are read and discarded before any line is collected
 * @return the collected lines; empty when the file list contains no files
 * @throws KettleException if the file cannot be opened or read
 */
private List<String> getFirst(int nrlines, boolean skipHeaders) throws KettleException {
  TextFileInputMeta meta = new TextFileInputMeta();
  getInfo(meta);
  FileInputList textFileList = meta.getTextFileList(transMeta);

  InputStream fi = null;
  CompressionInputStream f = null;
  StringBuilder lineStringBuilder = new StringBuilder(256);
  int fileFormatType = meta.getFileFormatTypeNr();
  List<String> retval = new ArrayList<String>();

  if (textFileList.nrOfFiles() > 0) {
    FileObject file = textFileList.getFile(0);
    try {
      fi = KettleVFS.getInputStream(file);
      CompressionProvider provider =
          CompressionProviderFactory.getInstance().createCompressionProviderInstance(meta.getFileCompression());
      f = provider.createInputStream(fi);

      InputStreamReader reader;
      if (meta.getEncoding() != null && meta.getEncoding().length() > 0) {
        reader = new InputStreamReader(f, meta.getEncoding());
      } else {
        reader = new InputStreamReader(f);
      }
      EncodingType encodingType = EncodingType.guessEncodingType(reader.getEncoding());

      int linenr = 0;
      // The limit always includes the header line count, whether or not headers are skipped below.
      // NOTE(review): with skipHeaders == true this yields more than nrlines data lines - confirm intended.
      int maxnr = nrlines + (meta.hasHeader() ? meta.getNrHeaderLines() : 0);

      if (skipHeaders) {
        // Skip the document header lines first (paged layout only); it helps us position
        if (meta.isLayoutPaged() && meta.getNrLinesDocHeader() > 0) {
          int skipped = 0;
          String line = TextFileInput.getLine(log, reader, encodingType, fileFormatType, lineStringBuilder);
          while (line != null && skipped < meta.getNrLinesDocHeader() - 1) {
            skipped++;
            line = TextFileInput.getLine(log, reader, encodingType, fileFormatType, lineStringBuilder);
          }
        }
        // Skip the header lines first if more than one, it helps us position
        if (meta.hasHeader() && meta.getNrHeaderLines() > 0) {
          int skipped = 0;
          String line = TextFileInput.getLine(log, reader, encodingType, fileFormatType, lineStringBuilder);
          while (line != null && skipped < meta.getNrHeaderLines() - 1) {
            skipped++;
            line = TextFileInput.getLine(log, reader, encodingType, fileFormatType, lineStringBuilder);
          }
        }
      }

      // Collect lines until the limit is reached or the input ends; nrlines == 0 disables the limit.
      String line = TextFileInput.getLine(log, reader, encodingType, fileFormatType, lineStringBuilder);
      while (line != null && (linenr < maxnr || nrlines == 0)) {
        retval.add(line);
        linenr++;
        line = TextFileInput.getLine(log, reader, encodingType, fileFormatType, lineStringBuilder);
      }
    } catch (Exception e) {
      throw new KettleException(BaseMessages.getString(PKG, "TextFileInputDialog.Exception.ErrorGettingFirstLines", "" + nrlines, file.getName().getURI()), e);
    } finally {
      try {
        if (f != null) {
          // Presumably closing the compression stream also closes the raw stream it wraps.
          f.close();
        } else if (fi != null) {
          // The compression wrapper was never created, so the raw stream must be closed directly
          // or it would leak.
          fi.close();
        }
      } catch (Exception e) {
        // Ignore errors
      }
    }
  }
  return retval;
}
use of org.pentaho.di.trans.steps.textfileinput.EncodingType in project pentaho-kettle by pentaho.
the class CsvInput method readFieldNamesFromFile.
/**
 * Reads the first line of the given file and derives the field names from it.
 *
 * <p>Delimiter, enclosure and encoding are taken from the step metadata after variable
 * substitution. The stream is wrapped in a {@code BOMInputStream} configured for UTF-8,
 * UTF-16LE and UTF-16BE byte order marks.
 *
 * @param fileName the name of the file to read, resolved through {@code KettleVFS}
 * @param csvInputMeta the step metadata supplying delimiter, enclosure, escape character and encoding
 * @return the field names with enclosures removed (when an enclosure is configured) and trimmed
 * @throws KettleException if the file cannot be resolved; an {@link IOException} while reading is
 *         rethrown as a {@code KettleFileException}
 */
String[] readFieldNamesFromFile(String fileName, CsvInputMeta csvInputMeta) throws KettleException {
  String delimiter = environmentSubstitute(csvInputMeta.getDelimiter());
  String enclosure = environmentSubstitute(csvInputMeta.getEnclosure());
  String realEncoding = environmentSubstitute(csvInputMeta.getEncoding());

  try (FileObject fileObject = KettleVFS.getFileObject(fileName, getTransMeta());
      BOMInputStream inputStream = new BOMInputStream(KettleVFS.getInputStream(fileObject),
          ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE)) {
    // Fall back to the platform default charset when no encoding is configured.
    InputStreamReader reader = Utils.isEmpty(realEncoding)
        ? new InputStreamReader(inputStream)
        : new InputStreamReader(inputStream, realEncoding);
    EncodingType encodingType = EncodingType.guessEncodingType(reader.getEncoding());

    String line = TextFileInput.getLine(log, reader, encodingType, TextFileInputMeta.FILE_FORMAT_UNIX, new StringBuilder(1000));
    String[] fieldNames = CsvInput.guessStringsFromLine(log, line, delimiter, enclosure, csvInputMeta.getEscapeCharacter());

    // Strip enclosures using the same variable-resolved value that was used to split the line;
    // the raw metadata value may still be an unresolved variable expression.
    if (!Utils.isEmpty(enclosure)) {
      removeEnclosure(fieldNames, enclosure);
    }
    trimFieldNames(fieldNames);
    return fieldNames;
  } catch (IOException e) {
    throw new KettleFileException(BaseMessages.getString(PKG, "CsvInput.Exception.CreateFieldMappingError"), e);
  }
}
use of org.pentaho.di.trans.steps.textfileinput.EncodingType in project pentaho-kettle by pentaho.
the class TextFileInputDialog method getCSV.
// Get the data layout
/**
 * Detects the field layout of the first matching text file and fills the fields table.
 *
 * <p>Steps, in order: optionally ask whether the existing field list should be cleared, read the
 * first line to derive field names, ask how many lines to sample, then run
 * {@code TextFileCSVImportProgressDialog} on the same reader and show its result message.
 * Errors are reported through dialogs; nothing is thrown to the caller.
 */
private void getCSV() {
TextFileInputMeta meta = new TextFileInputMeta();
getInfo(meta);
// Snapshot of the settings before detection; used below when the existing fields are kept.
TextFileInputMeta previousMeta = (TextFileInputMeta) meta.clone();
FileInputList textFileList = meta.getTextFileList(transMeta);
InputStream fileInputStream;
CompressionInputStream inputStream = null;
StringBuilder lineStringBuilder = new StringBuilder(256);
int fileFormatType = meta.getFileFormatTypeNr();
String delimiter = transMeta.environmentSubstitute(meta.getSeparator());
String enclosure = transMeta.environmentSubstitute(meta.getEnclosure());
String escapeCharacter = transMeta.environmentSubstitute(meta.getEscapeCharacter());
if (textFileList.nrOfFiles() > 0) {
// Default answer: clear the fields when a header is present, keep them otherwise.
int clearFields = meta.hasHeader() ? SWT.YES : SWT.NO;
int nrInputFields = meta.getInputFields().length;
// Only ask the user when there is a header and fields are already defined.
if (meta.hasHeader() && nrInputFields > 0) {
MessageBox mb = new MessageBox(shell, SWT.YES | SWT.NO | SWT.CANCEL | SWT.ICON_QUESTION);
mb.setMessage(BaseMessages.getString(PKG, "TextFileInputDialog.ClearFieldList.DialogMessage"));
mb.setText(BaseMessages.getString(PKG, "TextFileInputDialog.ClearFieldList.DialogTitle"));
clearFields = mb.open();
// Cancel aborts the whole operation before anything is modified.
if (clearFields == SWT.CANCEL) {
return;
}
}
try {
wFields.table.removeAll();
FileObject fileObject = textFileList.getFile(0);
fileInputStream = KettleVFS.getInputStream(fileObject);
Table table = wFields.table;
CompressionProvider provider = CompressionProviderFactory.getInstance().createCompressionProviderInstance(meta.getFileCompression());
// NOTE(review): if wrapping fails here, fileInputStream is never closed (finally only closes inputStream).
inputStream = provider.createInputStream(fileInputStream);
InputStreamReader reader;
// Use the configured encoding when set, otherwise the platform default.
if (meta.getEncoding() != null && meta.getEncoding().length() > 0) {
reader = new InputStreamReader(inputStream, meta.getEncoding());
} else {
reader = new InputStreamReader(inputStream);
}
EncodingType encodingType = EncodingType.guessEncodingType(reader.getEncoding());
// NOTE(review): given how clearFields is initialised above, this condition appears to always
// hold, which would make the else branch (UnableToReadHeaderLine) unreachable - confirm.
if (clearFields == SWT.YES || !meta.hasHeader() || nrInputFields > 0) {
// Scan the header-line, determine fields...
String line;
if (meta.hasHeader() || meta.getInputFields().length == 0) {
line = TextFileInput.getLine(log, reader, encodingType, fileFormatType, lineStringBuilder);
if (line != null) {
// Estimate the number of input fields...
// Chop up the line using the delimiter
String[] fields = TextFileInput.guessStringsFromLine(transMeta, log, line, meta, delimiter, enclosure, escapeCharacter);
for (int i = 0; i < fields.length; i++) {
String field = fields[i];
// Generate a 1-based "FieldN" name when the value is blank, or when there is neither a
// header nor any previously defined field.
if (field == null || field.length() == 0 || (nrInputFields == 0 && !meta.hasHeader())) {
field = "Field" + (i + 1);
} else {
// Trim the field
field = Const.trim(field);
// Replace all spaces & - with underscore _
field = Const.replace(field, " ", "_");
field = Const.replace(field, "-", "_");
}
TableItem item = new TableItem(table, SWT.NONE);
item.setText(1, field);
// The default type is String...
item.setText(2, "String");
}
wFields.setRowNums();
wFields.optWidth(true);
// Copy it...
getInfo(meta);
}
}
// Sample a few lines to determine the correct type of the fields...
String shellText = BaseMessages.getString(PKG, "TextFileInputDialog.LinesToSample.DialogTitle");
String lineText = BaseMessages.getString(PKG, "TextFileInputDialog.LinesToSample.DialogMessage");
EnterNumberDialog end = new EnterNumberDialog(shell, 100, shellText, lineText);
int samples = end.open();
// Presumably a negative value means the number dialog was cancelled - TODO confirm.
if (samples >= 0) {
getInfo(meta);
// The same reader is handed over, so sampling continues after any line already consumed above.
TextFileCSVImportProgressDialog pd = new TextFileCSVImportProgressDialog(shell, meta, transMeta, reader, samples, clearFields == SWT.YES);
String message = pd.open();
// A null message leaves the table as it is.
if (message != null) {
wFields.removeAll();
// OK, what's the result of our search?
getData(meta);
// When the user chose to keep the existing fields, restore them from the snapshot and
// select the rows from index previousMeta.getInputFields().length up to the last row.
if (clearFields == SWT.NO) {
getFieldsData(previousMeta, true);
wFields.table.setSelection(previousMeta.getInputFields().length, wFields.table.getItemCount() - 1);
}
wFields.removeEmptyRows();
wFields.setRowNums();
wFields.optWidth(true);
// Show the scan result message in a read-only text dialog.
EnterTextDialog etd = new EnterTextDialog(shell, BaseMessages.getString(PKG, "TextFileInputDialog.ScanResults.DialogTitle"), BaseMessages.getString(PKG, "TextFileInputDialog.ScanResults.DialogMessage"), message, true);
etd.setReadOnly();
etd.open();
}
}
} else {
MessageBox mb = new MessageBox(shell, SWT.OK | SWT.ICON_ERROR);
mb.setMessage(BaseMessages.getString(PKG, "TextFileInputDialog.UnableToReadHeaderLine.DialogMessage"));
mb.setText(BaseMessages.getString(PKG, "System.Dialog.Error.Title"));
mb.open();
}
} catch (IOException e) {
new ErrorDialog(shell, BaseMessages.getString(PKG, "TextFileInputDialog.IOError.DialogTitle"), BaseMessages.getString(PKG, "TextFileInputDialog.IOError.DialogMessage"), e);
} catch (KettleException e) {
new ErrorDialog(shell, BaseMessages.getString(PKG, "System.Dialog.Error.Title"), BaseMessages.getString(PKG, "TextFileInputDialog.ErrorGettingFileDesc.DialogMessage"), e);
} finally {
// Best-effort close of the (possibly never opened) compression stream.
try {
if (inputStream != null) {
inputStream.close();
}
} catch (Exception e) {
// Ignore errors
}
}
} else {
// No file matched the current settings: tell the user.
MessageBox mb = new MessageBox(shell, SWT.OK | SWT.ICON_ERROR);
mb.setMessage(BaseMessages.getString(PKG, "TextFileInputDialog.NoValidFileFound.DialogMessage"));
mb.setText(BaseMessages.getString(PKG, "System.Dialog.Error.Title"));
mb.open();
}
}
use of org.pentaho.di.trans.steps.textfileinput.EncodingType in project pentaho-kettle by pentaho.
the class CsvInputDialog method getCSV.
// Get the data layout
/**
 * Detects the field layout of the configured CSV file and fills the fields table.
 *
 * <p>Only local files are accepted. The first line is read to obtain (or count) the fields, the
 * user is asked how many lines to sample, and {@code TextFileCSVImportProgressDialog} then
 * continues on the same reader. Errors are reported through dialogs; nothing is thrown to the
 * caller.
 */
private void getCSV() {
  InputStream inputStream = null;
  try {
    CsvInputMeta meta = new CsvInputMeta();
    getInfo(meta);
    String filename = transMeta.environmentSubstitute(meta.getFilename());
    String delimiter = transMeta.environmentSubstitute(meta.getDelimiter());
    String enclosure = transMeta.environmentSubstitute(meta.getEnclosure());

    FileObject fileObject = KettleVFS.getFileObject(filename);
    if (!(fileObject instanceof LocalFile)) {
      // Anything that is not a local file is rejected.
      throw new KettleException(BaseMessages.getString(PKG, "CsvInput.Log.OnlyLocalFilesAreSupported"));
    }

    wFields.table.removeAll();
    inputStream = KettleVFS.getInputStream(fileObject);

    // Use the configured encoding when set, otherwise the platform default.
    String realEncoding = transMeta.environmentSubstitute(meta.getEncoding());
    InputStreamReader reader;
    if (Utils.isEmpty(realEncoding)) {
      reader = new InputStreamReader(inputStream);
    } else {
      reader = new InputStreamReader(inputStream, realEncoding);
    }
    EncodingType encodingType = EncodingType.guessEncodingType(reader.getEncoding());

    // Read a line of data to determine the number of fields...
    //
    String line = TextFileInput.getLine(log, reader, encodingType, TextFileInputMeta.FILE_FORMAT_UNIX, new StringBuilder(1000));

    // Split the string, header or data into parts...
    //
    String[] fieldNames = CsvInput.guessStringsFromLine(log, line, delimiter, enclosure, meta.getEscapeCharacter());

    if (!meta.isHeaderPresent()) {
      // Don't use field names from the header...
      // Generate field names Field_000, Field_001, ...
      //
      DecimalFormat df = new DecimalFormat("000");
      for (int i = 0; i < fieldNames.length; i++) {
        fieldNames[i] = "Field_" + df.format(i);
      }
    } else {
      // Strip enclosures using the same variable-resolved value that was used to split the line;
      // the raw metadata value may still be an unresolved variable expression.
      if (!Utils.isEmpty(enclosure)) {
        for (int i = 0; i < fieldNames.length; i++) {
          if (fieldNames[i].startsWith(enclosure) && fieldNames[i].endsWith(enclosure) && fieldNames[i].length() > 1) {
            fieldNames[i] = fieldNames[i].substring(1, fieldNames[i].length() - 1);
          }
        }
      }
    }

    // Trim the names
    //
    for (int i = 0; i < fieldNames.length; i++) {
      fieldNames[i] = Const.trim(fieldNames[i]);
    }

    // Add one table row per field; every field starts out typed as String
    //
    for (int i = 0; i < fieldNames.length; i++) {
      TableItem item = new TableItem(wFields.table, SWT.NONE);
      item.setText(1, fieldNames[i]);
      item.setText(2, ValueMetaFactory.getValueMetaName(ValueMetaInterface.TYPE_STRING));
    }
    wFields.removeEmptyRows();
    wFields.setRowNums();
    wFields.optWidth(true);

    // Now we can continue reading the rows of data:
    // sample a few lines to determine the correct type of the fields...
    //
    String shellText = BaseMessages.getString(PKG, "CsvInputDialog.LinesToSample.DialogTitle");
    String lineText = BaseMessages.getString(PKG, "CsvInputDialog.LinesToSample.DialogMessage");
    EnterNumberDialog end = new EnterNumberDialog(shell, 100, shellText, lineText);
    int samples = end.open();
    // Presumably a negative value means the number dialog was cancelled - TODO confirm.
    if (samples >= 0) {
      getInfo(meta);
      // The same reader is handed over, so sampling continues after the line consumed above.
      TextFileCSVImportProgressDialog pd = new TextFileCSVImportProgressDialog(shell, meta, transMeta, reader, samples, true);
      String message = pd.open();
      if (message != null) {
        wFields.removeAll();
        // OK, what's the result of our search?
        getData(meta, false);
        wFields.removeEmptyRows();
        wFields.setRowNums();
        wFields.optWidth(true);
        EnterTextDialog etd = new EnterTextDialog(shell, BaseMessages.getString(PKG, "CsvInputDialog.ScanResults.DialogTitle"), BaseMessages.getString(PKG, "CsvInputDialog.ScanResults.DialogMessage"), message, true);
        etd.setReadOnly();
        etd.open();
      }
    }
  } catch (IOException e) {
    new ErrorDialog(shell, BaseMessages.getString(PKG, "CsvInputDialog.IOError.DialogTitle"), BaseMessages.getString(PKG, "CsvInputDialog.IOError.DialogMessage"), e);
  } catch (KettleException e) {
    new ErrorDialog(shell, BaseMessages.getString(PKG, "System.Dialog.Error.Title"), BaseMessages.getString(PKG, "CsvInputDialog.ErrorGettingFileDesc.DialogMessage"), e);
  } finally {
    // The stream is still null when the failure happened before the file was opened.
    if (inputStream != null) {
      try {
        inputStream.close();
      } catch (Exception e) {
        // Ignore close errors
      }
    }
  }
}
use of org.pentaho.di.trans.steps.textfileinput.EncodingType in project pentaho-kettle by pentaho.
the class ParGzipCsvInputDialog method getCSV.
// Get the data layout
/**
 * Detects the field layout of the configured gzip-compressed CSV file and fills the fields table.
 *
 * <p>Only local files are accepted. The first line is read to obtain (or count) the fields, the
 * user is asked how many lines to sample, and {@code TextFileCSVImportProgressDialog} then
 * continues on the same reader. Errors are reported through dialogs; nothing is thrown to the
 * caller.
 */
private void getCSV() {
  InputStream inputStream = null;
  try {
    ParGzipCsvInputMeta meta = new ParGzipCsvInputMeta();
    getInfo(meta);
    String filename = transMeta.environmentSubstitute(meta.getFilename());

    FileObject fileObject = KettleVFS.getFileObject(filename);
    if (!(fileObject instanceof LocalFile)) {
      // Anything that is not a local file is rejected.
      throw new KettleException(BaseMessages.getString(PKG, "ParGzipCsvInput.Log.OnlyLocalFilesAreSupported"));
    }

    wFields.table.removeAll();

    // Keep a handle on the raw stream first: if the GZIP wrapper cannot be created (for example
    // the file is not in gzip format), the finally block can still close the raw stream.
    inputStream = KettleVFS.getInputStream(fileObject);
    inputStream = new GZIPInputStream(inputStream);

    // No encoding is configured here, so the reader uses the platform default charset.
    InputStreamReader reader = new InputStreamReader(inputStream);
    EncodingType encodingType = EncodingType.guessEncodingType(reader.getEncoding());

    // Read a line of data to determine the number of fields...
    //
    String line = TextFileInput.getLine(log, reader, encodingType, TextFileInputMeta.FILE_FORMAT_MIXED, new StringBuilder(1000));

    // Split the string, header or data into parts...
    //
    String[] fieldNames = Const.splitString(line, meta.getDelimiter());

    if (!meta.isHeaderPresent()) {
      // Don't use field names from the header...
      // Generate field names Field_000, Field_001, ...
      //
      DecimalFormat df = new DecimalFormat("000");
      for (int i = 0; i < fieldNames.length; i++) {
        fieldNames[i] = "Field_" + df.format(i);
      }
    } else {
      // Strip one leading and one trailing character from names wrapped in the enclosure.
      if (!Utils.isEmpty(meta.getEnclosure())) {
        for (int i = 0; i < fieldNames.length; i++) {
          if (fieldNames[i].startsWith(meta.getEnclosure()) && fieldNames[i].endsWith(meta.getEnclosure()) && fieldNames[i].length() > 1) {
            fieldNames[i] = fieldNames[i].substring(1, fieldNames[i].length() - 1);
          }
        }
      }
    }

    // Trim the names
    //
    for (int i = 0; i < fieldNames.length; i++) {
      fieldNames[i] = Const.trim(fieldNames[i]);
    }

    // Add one table row per field; every field starts out typed as String
    //
    for (int i = 0; i < fieldNames.length; i++) {
      TableItem item = new TableItem(wFields.table, SWT.NONE);
      item.setText(1, fieldNames[i]);
      item.setText(2, ValueMetaFactory.getValueMetaName(ValueMetaInterface.TYPE_STRING));
    }
    wFields.removeEmptyRows();
    wFields.setRowNums();
    wFields.optWidth(true);

    // Now we can continue reading the rows of data:
    // sample a few lines to determine the correct type of the fields...
    //
    String shellText = BaseMessages.getString(PKG, "ParGzipCsvInputDialog.LinesToSample.DialogTitle");
    String lineText = BaseMessages.getString(PKG, "ParGzipCsvInputDialog.LinesToSample.DialogMessage");
    EnterNumberDialog end = new EnterNumberDialog(shell, 100, shellText, lineText);
    int samples = end.open();
    // Presumably a negative value means the number dialog was cancelled - TODO confirm.
    if (samples >= 0) {
      getInfo(meta);
      // The same reader is handed over, so sampling continues after the line consumed above.
      TextFileCSVImportProgressDialog pd = new TextFileCSVImportProgressDialog(shell, meta, transMeta, reader, samples, true);
      String message = pd.open();
      if (message != null) {
        wFields.removeAll();
        // OK, what's the result of our search?
        getData(meta);
        wFields.removeEmptyRows();
        wFields.setRowNums();
        wFields.optWidth(true);
        EnterTextDialog etd = new EnterTextDialog(shell, BaseMessages.getString(PKG, "ParGzipCsvInputDialog.ScanResults.DialogTitle"), BaseMessages.getString(PKG, "ParGzipCsvInputDialog.ScanResults.DialogMessage"), message, true);
        etd.setReadOnly();
        etd.open();
      }
    }
  } catch (IOException e) {
    new ErrorDialog(shell, BaseMessages.getString(PKG, "ParGzipCsvInputDialog.IOError.DialogTitle"), BaseMessages.getString(PKG, "ParGzipCsvInputDialog.IOError.DialogMessage"), e);
  } catch (KettleException e) {
    new ErrorDialog(shell, BaseMessages.getString(PKG, "System.Dialog.Error.Title"), BaseMessages.getString(PKG, "ParGzipCsvInputDialog.ErrorGettingFileDesc.DialogMessage"), e);
  } finally {
    // The stream is still null when the failure happened before the file was opened.
    if (inputStream != null) {
      try {
        inputStream.close();
      } catch (Exception e) {
        // Ignore errors
      }
    }
  }
}
Aggregations