use of org.talend.dataprep.api.type.Type in project data-prep by Talend.
the class AbstractFillWith method applyOnColumn.
// TODO: utility overridden method (WTF?)
/**
 * Fills the target column of the given row with a replacement value, when the row qualifies for processing
 * (see {@code shouldBeProcessed}).
 * <p>
 * The replacement value is the constant stored under {@code DEFAULT_VALUE_PARAMETER} when the mode parameter equals
 * {@code CONSTANT_MODE}; otherwise it is the value the row holds in the column designated by
 * {@code SELECTED_COLUMN_PARAMETER}. When the current column is typed as a date, the replacement value is
 * re-formatted with the most used date pattern of the column ({@code DEFAULT_FORMATTER} when there is none). If a
 * {@link DateTimeException} is raised while parsing or formatting, the raw replacement value is kept.
 *
 * @param row the row to update.
 * @param context the action context giving access to the parameters, the current column id and the row metadata.
 */
public void applyOnColumn(DataSetRow row, ActionContext context) {
    final Map<String, String> parameters = context.getParameters();
    final String columnId = context.getColumnId();
    final ColumnMetadata columnMetadata = context.getRowMetadata().getById(columnId);
    if (shouldBeProcessed(row, columnId)) {
        String newValue;
        // First, get raw new value regarding mode (constant or other column):
        if (parameters.get(MODE_PARAMETER).equals(CONSTANT_MODE)) {
            newValue = parameters.get(DEFAULT_VALUE_PARAMETER);
        } else {
            final RowMetadata rowMetadata = context.getRowMetadata();
            final ColumnMetadata selectedColumn = rowMetadata.getById(parameters.get(SELECTED_COLUMN_PARAMETER));
            newValue = row.get(selectedColumn.getId());
        }
        // Second: if we're on a date column, format new value with the most frequent pattern of the column:
        Type type = columnMetadata == null ? Type.ANY : Type.get(columnMetadata.getType());
        if (type.equals(Type.DATE)) {
            try {
                final LocalDateTime date = Providers.get().parse(newValue, columnMetadata);
                final String mostUsedDatePattern = RowMetadataUtils.getMostUsedDatePattern(columnMetadata);
                DateTimeFormatter ourNiceFormatter = mostUsedDatePattern == null ? DEFAULT_FORMATTER
                        : new DatePattern(mostUsedDatePattern).getFormatter();
                newValue = ourNiceFormatter.format(date);
            } catch (DateTimeException e) {
                // Nothing to do, if we can't get a valid pattern, keep the raw value.
                // Log the replacement value (the one that was actually parsed), not the original cell content.
                LOGGER.debug("Unable to parse date {}.", newValue, e);
            }
        }
        // At the end, set the new value:
        row.set(ActionsUtils.getTargetColumnId(context), newValue);
    }
}
use of org.talend.dataprep.api.type.Type in project data-prep by Talend.
the class CSVFastHeaderAndTypeAnalyzer method analyze.
/**
 * Runs the header detection, then the column typing, on the sampled lines.
 * <p>
 * The work is done only once: any later call returns immediately. When the method completes, {@code headers}
 * holds one (name, type) pair per column. Names come from the first sampled line when it was detected as a
 * header, and are generated from the column position otherwise.
 */
public void analyze() {
    // a previous call already did the work
    if (analysisPerformed) {
        return;
    }

    // Step 1: header detection, which needs at least two sampled lines
    if (sampleLines.size() <= 1) {
        firstLineAHeader = false;
    } else if (!(separator.getCountPerLine().containsKey(1) || separator.getCountPerLine().isEmpty())) {
        // first line is not a header
        firstLineAHeader = false;
        headerInfoReliable = true;
    } else {
        final List<Type> headerCandidateTypes = firstRecordTyping();
        final List<Type> remainingRecordsTypes = columnTypingWithoutFirstRecord();
        final boolean candidateHoldsTypedValue = headerCandidateTypes.contains(Type.INTEGER) //
                || headerCandidateTypes.contains(Type.DOUBLE) //
                || headerCandidateTypes.contains(Type.BOOLEAN);
        if (candidateHoldsTypedValue) {
            // an integer, double or boolean on the first line: it is handled as a regular record
            firstLineAHeader = false;
            headerInfoReliable = true;
        } else if (allStringTypes(headerCandidateTypes) && !sampleTypes[0].contains(ABSENT)) {
            final boolean remainingRecordsHoldTypedValue = remainingRecordsTypes.contains(Type.INTEGER) //
                    || remainingRecordsTypes.contains(Type.DOUBLE) //
                    || remainingRecordsTypes.contains(Type.BOOLEAN);
            if (remainingRecordsHoldTypedValue) {
                // mark the separator as having a header
                firstLineAHeader = true;
                headerInfoReliable = true;
            }
        }
    }

    // Step 2: type analysis. With a header the first line is left out of the typing, otherwise it is part of it.
    headers = new ArrayList<>();
    if (firstLineAHeader) {
        final List<Type> typesWithoutHeader = columnTypingWithoutFirstRecord();
        final List<String> headerFields = readLine(sampleLines.get(0));
        for (int position = 0; position < headerFields.size(); position++) {
            headers.add(new Pair<>(headerFields.get(position), typesWithoutHeader.get(position)));
        }
    } else {
        final List<Type> typesOfAllRecords = allRecordsColumnTyping();
        for (int position = 0; position < typesOfAllRecords.size(); position++) {
            // generated names are numbered from 1
            final String generatedName = message("import.local.generated_column_name", position + 1);
            headers.add(new Pair<>(generatedName, typesOfAllRecords.get(position)));
        }
    }

    analysisPerformed = true;
}
use of org.talend.dataprep.api.type.Type in project data-prep by Talend.
the class StatisticsAdapter method injectDataTypeAnalysis.
/**
 * Copies the data type suggested by the analysis onto the column, then injects the value quality.
 * <p>
 * The column type is only updated when the result holds a {@code DataTypeOccurences} entry and the column type
 * was not forced. The value quality injection always runs.
 *
 * @param column the column metadata to update.
 * @param result the analysis result to read from.
 */
private void injectDataTypeAnalysis(final ColumnMetadata column, final Analyzers.Result result) {
    final boolean typeCanBeUpdated = result.exist(DataTypeOccurences.class) && !column.isTypeForced();
    if (typeCanBeUpdated) {
        final String suggestedTypeName = result.get(DataTypeOccurences.class).getSuggestedType().name();
        // the suggested type can be modified by #injectValueQuality
        column.setType(Type.get(suggestedTypeName).getName());
    }
    injectValueQuality(column, result);
}
use of org.talend.dataprep.api.type.Type in project data-prep by Talend.
the class XlsSchemaParser method parsePerSheet.
/**
 * Builds the columns metadata of the given sheet.
 * <p>
 * For each column of the sheet type matrix, a type is guessed and a name is picked: the text of the first row
 * cell when the guessed header size is 1 and the first row exists, a generated name otherwise (or when that
 * text is empty).
 *
 * @param sheet the sheet to look at.
 * @param datasetId the dataset id (used for logging).
 * @param formulaEvaluator the evaluator handed over to the cell reading helpers.
 * @return the columns metadata for the given sheet.
 */
private List<ColumnMetadata> parsePerSheet(Sheet sheet, String datasetId, FormulaEvaluator formulaEvaluator) {
    LOGGER.debug(Markers.dataset(datasetId), "parsing sheet '{}'", sheet.getSheetName());

    // column id -> (row id -> cell type)
    final SortedMap<Integer, SortedMap<Integer, String>> typesByColumn = collectSheetTypeMatrix(sheet, formulaEvaluator);
    final int averageHeaderSize = guessHeaderSize(typesByColumn);

    // the type matrix is all that is needed to guess each column metadata (type and header value)
    final List<ColumnMetadata> columnsMetadata = new ArrayList<>(typesByColumn.size());
    typesByColumn.forEach((columnIndex, typesByRow) -> {
        final Type columnType = guessColumnType(columnIndex, typesByRow, averageHeaderSize);

        // with a one-line header, the header value is the first row of the column
        String columnName = averageHeaderSize == 1 && sheet.getRow(0) != null //
                ? XlsUtils.getCellValueAsString(sheet.getRow(0).getCell(columnIndex), formulaEvaluator) //
                : null;

        // the name cannot be null or empty: fall back on a generated one (+1 because column ids start from 0)
        if (StringUtils.isEmpty(columnName)) {
            columnName = message("import.local.generated_column_name", columnIndex + 1);
        }

        // FIXME what do we do if header size is > 1 concat all lines?
        final ColumnMetadata columnMetadata = ColumnMetadata.Builder.column() //
                .headerSize(averageHeaderSize) //
                .name(columnName) //
                .type(columnType) //
                .build();
        columnsMetadata.add(columnMetadata);
    });
    return columnsMetadata;
}
use of org.talend.dataprep.api.type.Type in project data-prep by Talend.
the class ContentAnalysisTest method createMetadata.
/**
 * Creates a data set metadata with the given id and one {@code Type.STRING} column per header name.
 *
 * @param id the data set id.
 * @param header the column names, in column order.
 * @return the data set metadata with its row metadata set.
 */
private DataSetMetadata createMetadata(String id, List<String> header) {
    final DataSetMetadata dataSetMetadata = metadataBuilder.metadata().id(id).build();
    final RowMetadata rowMetadata = new RowMetadata(header.stream() //
            .map(columnName -> ColumnMetadata.Builder.column().name(columnName).type(Type.STRING).build()) //
            .collect(Collectors.toList()));
    dataSetMetadata.setRowMetadata(rowMetadata);
    return dataSetMetadata;
}
Aggregations