Use of org.apache.hop.core.file.IInputFileMeta in the Apache Hop project.
Class TextFileCSVImportProgressDialog, method doScan.
/**
 * Samples lines from the reader, feeds every column value to a per-field {@link StringEvaluator},
 * writes the advised type/format/length back into the input field definitions of {@code meta},
 * and builds a textual report describing each field.
 *
 * <p>When {@code samples} is 0 the scan continues until the reader runs out of lines; otherwise at
 * most {@code samples} rows are evaluated. The scan also stops when the monitor is cancelled or
 * when {@code TextFileInput.convertLineToRow} returns {@code null} for a line.
 *
 * <p>Fields for which no value was sampled (for example an empty file, or rows with fewer columns
 * than defined fields) keep their current metadata and are still listed in the report.
 *
 * @param monitor progress monitor that receives task/sub-task updates and is polled for cancellation
 * @param failOnParseError passed through to {@code TextFileInput.convertLineToRow} and
 *     {@code getStringFromRow}
 * @return the report text accumulated for all fields
 * @throws HopException declared by the metadata and row conversion calls used during the scan
 */
private String doScan(IProgressMonitor monitor, final boolean failOnParseError) throws HopException {
  if (samples > 0) {
    monitor.beginTask(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningFile"), samples + 1);
  } else {
    monitor.beginTask(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningFile"), 2);
  }

  long fileLineNumber = 0;
  DecimalFormatSymbols dfs = new DecimalFormatSymbols();
  int nrFields = meta.getInputFields().length;
  String[] dateFormats = Const.getDateFormats();

  IRowMeta outputRowMeta = new RowMeta();
  meta.getFields(outputRowMeta, null, null, null, variables, null);
  // Remove the storage meta-data (don't go for lazy conversion during scan)
  for (IValueMeta valueMeta : outputRowMeta.getValueMetaList()) {
    valueMeta.setStorageMetadata(null);
    valueMeta.setStorageType(IValueMeta.STORAGE_TYPE_NORMAL);
  }
  IRowMeta convertRowMeta = outputRowMeta.cloneToType(IValueMeta.TYPE_STRING);

  // Per-field statistics that end up in the report.
  // How many null values?
  int[] nrnull = new int[nrFields];
  // min / max string
  String[] minstr = new String[nrFields];
  String[] maxstr = new String[nrFields];
  // How many date formats work, and which ones?
  int[] dateFormatCount = new int[nrFields];
  boolean[][] dateFormat = new boolean[nrFields][dateFormats.length];
  // min / max date value per date format
  Date[][] minDate = new Date[nrFields][dateFormats.length];
  Date[][] maxDate = new Date[nrFields][dateFormats.length];

  for (int i = 0; i < nrFields; i++) {
    TextFileInputField field = meta.getInputFields()[i];
    if (log.isDebug()) {
      debug = "init field #" + i;
    }
    if (replaceMeta) {
      // Clear previous info...
      field.setName(meta.getInputFields()[i].getName());
      field.setType(meta.getInputFields()[i].getType());
      field.setFormat("");
      field.setLength(-1);
      field.setPrecision(-1);
      field.setCurrencySymbol(dfs.getCurrencySymbol());
      field.setDecimalSymbol("" + dfs.getDecimalSeparator());
      field.setGroupSymbol("" + dfs.getGroupingSeparator());
      field.setNullString("-");
      field.setTrimType(IValueMeta.TRIM_TYPE_NONE);
    }
    nrnull[i] = 0;
    minstr[i] = "";
    maxstr[i] = "";
    // Init date guess
    for (int j = 0; j < dateFormats.length; j++) {
      dateFormat[i][j] = true;
      minDate[i][j] = Const.MAX_DATE;
      maxDate[i][j] = Const.MIN_DATE;
    }
    dateFormatCount[i] = dateFormats.length;
  }

  // Work on a clone in which every field is read as a plain String.
  IInputFileMeta strinfo = (IInputFileMeta) meta.clone();
  for (int i = 0; i < nrFields; i++) {
    strinfo.getInputFields()[i].setType(IValueMeta.TYPE_STRING);
  }

  // Sample <samples> rows...
  debug = "get first line";
  StringBuilder lineBuffer = new StringBuilder(256);
  int fileFormatType = meta.getFileFormatTypeNr();

  // If the file has a header we overwrite the first line
  // However, if it doesn't have a header, take a new line
  //
  String line = TextFileInput.getLine(log, reader, encodingType, fileFormatType, lineBuffer);
  fileLineNumber++;
  if (meta.hasHeader()) {
    int skipped = 0;
    while (line != null && skipped < meta.getNrHeaderLines()) {
      line = TextFileInput.getLine(log, reader, encodingType, fileFormatType, lineBuffer);
      skipped++;
      fileLineNumber++;
    }
  }

  int linenr = 1;
  List<StringEvaluator> evaluators = new ArrayList<>();

  // Allocate number and date parsers
  DecimalFormat df2 = (DecimalFormat) NumberFormat.getInstance();
  DecimalFormatSymbols dfs2 = new DecimalFormatSymbols();
  SimpleDateFormat daf2 = new SimpleDateFormat();

  // These do not change while scanning, so resolve them once.
  String delimiter = variables.resolve(meta.getSeparator());
  String enclosure = variables.resolve(meta.getEnclosure());
  String escapeCharacter = variables.resolve(meta.getEscapeCharacter());

  while (line != null && (linenr <= samples || samples == 0) && !monitor.isCanceled()) {
    monitor.subTask(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningLine", "" + linenr));
    if (samples > 0) {
      monitor.worked(1);
    }
    if (log.isDebug()) {
      debug = "convert line #" + linenr + " to row";
    }

    IRowMeta rowMeta = new RowMeta();
    meta.getFields(rowMeta, "transformName", null, null, variables, null);
    // Remove the storage meta-data (don't go for lazy conversion during scan)
    for (IValueMeta valueMeta : rowMeta.getValueMetaList()) {
      valueMeta.setStorageMetadata(null);
      valueMeta.setStorageType(IValueMeta.STORAGE_TYPE_NORMAL);
    }

    Object[] r =
        TextFileInput.convertLineToRow(
            log,
            new TextFileLine(line, fileLineNumber, null),
            strinfo,
            null,
            0,
            outputRowMeta,
            convertRowMeta,
            meta.getFilePaths(variables)[0],
            rownumber,
            delimiter,
            enclosure,
            escapeCharacter,
            null,
            false,
            false,
            false,
            false,
            false,
            false,
            false,
            false,
            null,
            null,
            false,
            null,
            null,
            null,
            null,
            0,
            failOnParseError);
    if (r == null) {
      // No row could be produced for this line: stop scanning and report what we have so far.
      break;
    }
    rownumber++;

    for (int i = 0; i < nrFields && i < r.length; i++) {
      StringEvaluator evaluator;
      if (i >= evaluators.size()) {
        evaluator = new StringEvaluator(true);
        evaluators.add(evaluator);
      } else {
        evaluator = evaluators.get(i);
      }
      evaluator.evaluateString(getStringFromRow(rowMeta, r, i, failOnParseError));
    }

    fileLineNumber++;
    linenr++;

    // Grab another line...
    //
    line = TextFileInput.getLine(log, reader, encodingType, fileFormatType, lineBuffer);
  }

  monitor.worked(1);
  monitor.setTaskName(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.AnalyzingResults"));

  // Show information on items using a dialog box
  //
  StringBuilder message = new StringBuilder();
  message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.ResultAfterScanning", "" + (linenr - 1)));
  message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.HorizontalLine"));

  for (int i = 0; i < nrFields; i++) {
    TextFileInputField field = meta.getInputFields()[i];

    // An evaluator only exists for columns that were actually seen in a sampled row.
    StringEvaluator evaluator = i < evaluators.size() ? evaluators.get(i) : null;
    List<StringEvaluationResult> evaluationResults;
    StringEvaluationResult result;
    if (evaluator == null) {
      // Nothing was sampled for this field: leave its metadata untouched.
      evaluationResults = new ArrayList<>();
      result = null;
    } else {
      evaluationResults = evaluator.getStringEvaluationResults();
      result = evaluator.getAdvicedResult();
      // If we didn't find any matching result, it's a String...
      //
      if (evaluationResults.isEmpty()) {
        field.setType(IValueMeta.TYPE_STRING);
        field.setLength(evaluator.getMaxLength());
      }
    }

    if (result != null) {
      // Take the first option we find, list the others below...
      //
      IValueMeta conversionMeta = result.getConversionMeta();
      field.setType(conversionMeta.getType());
      field.setTrimType(conversionMeta.getTrimType());
      field.setFormat(conversionMeta.getConversionMask());
      field.setDecimalSymbol(conversionMeta.getDecimalSymbol());
      field.setGroupSymbol(conversionMeta.getGroupingSymbol());
      field.setLength(conversionMeta.getLength());
      field.setPrecision(conversionMeta.getPrecision());
      nrnull[i] = result.getNrNull();
      minstr[i] = result.getMin() == null ? "" : result.getMin().toString();
      maxstr[i] = result.getMax() == null ? "" : result.getMax().toString();
    }

    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.FieldNumber", "" + (i + 1)));
    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.FieldName", field.getName()));
    message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.FieldType", field.getTypeDesc()));

    switch (field.getType()) {
      case IValueMeta.TYPE_NUMBER:
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.EstimatedLength", (field.getLength() < 0 ? "-" : "" + field.getLength())));
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.EstimatedPrecision", field.getPrecision() < 0 ? "-" : "" + field.getPrecision()));
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberFormat", field.getFormat()));
        if (evaluationResults.size() > 1) {
          message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.WarnNumberFormat"));
        }
        for (StringEvaluationResult seResult : evaluationResults) {
          String mask = seResult.getConversionMeta().getConversionMask();
          message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberFormat2", mask));
          message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.TrimType", seResult.getConversionMeta().getTrimType()));
          message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberMinValue", seResult.getMin()));
          message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberMaxValue", seResult.getMax()));
          try {
            df2.applyPattern(mask);
            df2.setDecimalFormatSymbols(dfs2);
            double mn = df2.parse(seResult.getMin().toString()).doubleValue();
            message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberExample", mask, seResult.getMin(), Double.toString(mn)));
          } catch (Exception e) {
            // Best effort: the example line is simply left out of the report.
            if (log.isDetailed()) {
              log.logDetailed("This is unexpected: parsing [" + seResult.getMin() + "] with format [" + mask + "] did not work.");
            }
          }
        }
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.NumberNrNullValues", "" + nrnull[i]));
        break;
      case IValueMeta.TYPE_STRING:
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringMaxLength", "" + field.getLength()));
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringMinValue", minstr[i]));
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringMaxValue", maxstr[i]));
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.StringNrNullValues", "" + nrnull[i]));
        break;
      case IValueMeta.TYPE_DATE:
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateMaxLength", field.getLength() < 0 ? "-" : "" + field.getLength()));
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateFormat", field.getFormat()));
        if (dateFormatCount[i] > 1) {
          message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.WarnDateFormat"));
        }
        if (!Utils.isEmpty(minstr[i])) {
          for (int x = 0; x < dateFormats.length; x++) {
            if (dateFormat[i][x]) {
              message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateFormat2", dateFormats[x]));
              Date mindate = minDate[i][x];
              Date maxdate = maxDate[i][x];
              message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateMinValue", mindate.toString()));
              message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateMaxValue", maxdate.toString()));
              daf2.applyPattern(dateFormats[x]);
              try {
                Date md = daf2.parse(minstr[i]);
                message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateExample", dateFormats[x], minstr[i], md.toString()));
              } catch (Exception e) {
                // Best effort: the example line is simply left out of the report.
                if (log.isDetailed()) {
                  log.logDetailed("This is unexpected: parsing [" + minstr[i] + "] with format [" + dateFormats[x] + "] did not work.");
                }
              }
            }
          }
        }
        message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.DateNrNullValues", "" + nrnull[i]));
        break;
      default:
        break;
    }

    if (nrnull[i] == linenr - 1) {
      message.append(BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.AllNullValues"));
    }
    message.append(Const.CR);
  }

  monitor.worked(1);
  monitor.done();
  return message.toString();
}
Use of org.apache.hop.core.file.IInputFileMeta in the Apache Hop project.
Class TextFileInputTest, method convertLineToRowTest.
/**
 * Smoke test for {@code TextFileInput.convertLineToRow}: every value meta handed out by the output
 * row meta throws a {@link HopValueException} on string conversion, while the file meta reports
 * that errors are ignored and error lines are skipped. The test makes no assertions; it passes as
 * long as the call completes without throwing (in particular without a NullPointerException).
 *
 * @throws Exception if the call under test throws
 */
@Test
public void convertLineToRowTest() throws Exception {
  // A value meta whose conversion from String always fails.
  IValueMeta failingValueMeta = Mockito.mock(IValueMeta.class);
  Mockito.doThrow(new HopValueException("Error converting"))
      .when(failingValueMeta)
      .convertDataFromString(
          Mockito.anyString(),
          Mockito.any(IValueMeta.class),
          Mockito.anyString(),
          Mockito.anyString(),
          Mockito.anyInt());

  // Output row layout: reports 15 values, each backed by the failing value meta.
  IRowMeta outputRowMeta = Mockito.mock(IRowMeta.class);
  Mockito.doReturn(15).when(outputRowMeta).size();
  Mockito.doReturn(failingValueMeta).when(outputRowMeta).getValueMeta(Mockito.anyInt());

  // File meta describing three input fields of a CSV file with lenient error handling.
  TextFileInputField[] inputFields = new TextFileInputField[3];
  for (int idx = 0; idx < inputFields.length; idx++) {
    inputFields[idx] = new TextFileInputField();
  }
  IInputFileMeta fileMeta = Mockito.mock(IInputFileMeta.class);
  Mockito.doReturn(inputFields).when(fileMeta).getInputFields();
  Mockito.doReturn("CSV").when(fileMeta).getFileType();
  Mockito.doReturn("/").when(fileMeta).getEscapeCharacter();
  Mockito.doReturn(true).when(fileMeta).isErrorIgnored();
  Mockito.doReturn(true).when(fileMeta).isErrorLineSkipped();

  // The line to convert.
  TextFileLine inputLine = Mockito.mock(TextFileLine.class);
  inputLine.line = "testData1;testData2;testData3";

  ILogChannel logChannel = Mockito.mock(ILogChannel.class);
  IRowMeta convertRowMeta = Mockito.mock(IRowMeta.class);
  IFileErrorHandler errorHandler = Mockito.mock(IFileErrorHandler.class);
  Object[] passThruFields = new Object[3];

  // it should run without NPE
  TextFileInput.convertLineToRow(
      logChannel,
      inputLine,
      fileMeta,
      passThruFields,
      1,
      outputRowMeta,
      convertRowMeta,
      null,
      1L,
      ";",
      null,
      "/",
      errorHandler,
      false,
      false,
      false,
      false,
      false,
      false,
      false,
      false,
      null,
      null,
      false,
      new Date(),
      null,
      null,
      null,
      1L);
}
Aggregations