Search in sources :

Example 16 with Type

use of org.talend.dataprep.api.type.Type in project data-prep by Talend.

the class AbstractFillWith method applyOnColumn.

// TODO: overridden utility method - its purpose is unclear and should be reviewed
public void applyOnColumn(DataSetRow row, ActionContext context) {
    final Map<String, String> parameters = context.getParameters();
    final String columnId = context.getColumnId();
    final ColumnMetadata columnMetadata = context.getRowMetadata().getById(columnId);
    if (shouldBeProcessed(row, columnId)) {
        String newValue;
        // First, get raw new value regarding mode (constant or other column):
        if (parameters.get(MODE_PARAMETER).equals(CONSTANT_MODE)) {
            newValue = parameters.get(DEFAULT_VALUE_PARAMETER);
        } else {
            final RowMetadata rowMetadata = context.getRowMetadata();
            final ColumnMetadata selectedColumn = rowMetadata.getById(parameters.get(SELECTED_COLUMN_PARAMETER));
            newValue = row.get(selectedColumn.getId());
        }
        // Second: if we're on a date column, format new value with the most frequent pattern of the column:
        Type type = columnMetadata == null ? Type.ANY : Type.get(columnMetadata.getType());
        if (type.equals(Type.DATE)) {
            try {
                final LocalDateTime date = Providers.get().parse(newValue, columnMetadata);
                final String mostUsedDatePattern = RowMetadataUtils.getMostUsedDatePattern(columnMetadata);
                DateTimeFormatter ourNiceFormatter = mostUsedDatePattern == null ? DEFAULT_FORMATTER : new DatePattern(mostUsedDatePattern).getFormatter();
                newValue = ourNiceFormatter.format(date);
            } catch (DateTimeException e) {
                // Nothing to do, if we can't get a valid pattern, keep the raw value
                LOGGER.debug("Unable to parse date {}.", row.get(columnId), e);
            }
        }
        // At the end, set the new value:
        row.set(ActionsUtils.getTargetColumnId(context), newValue);
    }
}
Also used : LocalDateTime(java.time.LocalDateTime) ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) ParameterType(org.talend.dataprep.parameters.ParameterType) Type(org.talend.dataprep.api.type.Type) DateTimeException(java.time.DateTimeException) RowMetadata(org.talend.dataprep.api.dataset.RowMetadata) DateTimeFormatter(java.time.format.DateTimeFormatter) DatePattern(org.talend.dataprep.transformation.actions.date.DatePattern)

Example 17 with Type

use of org.talend.dataprep.api.type.Type in project data-prep by Talend.

the class CSVFastHeaderAndTypeAnalyzer method analyze.

/**
 * Performs header and typing analysis.
 */
public void analyze() {
    // no need to do the job twice
    if (analysisPerformed) {
        return;
    }
    // Perform Header analysis if the sample has at least two lines
    if (sampleLines.size() > 1) {
        // first line is not a header
        if (!separator.getCountPerLine().containsKey(1) && !separator.getCountPerLine().isEmpty()) {
            headerInfoReliable = true;
            firstLineAHeader = false;
        } else {
            final List<Type> firstRecordTypes = firstRecordTyping();
            final List<Type> columnTypingWithoutFirstRecord = columnTypingWithoutFirstRecord();
            // mark the separator as having a header
            if (firstRecordTypes.contains(Type.INTEGER) || firstRecordTypes.contains(Type.DOUBLE) || firstRecordTypes.contains(Type.BOOLEAN)) {
                firstLineAHeader = false;
                headerInfoReliable = true;
            } else if (allStringTypes(firstRecordTypes) && !sampleTypes[0].contains(ABSENT) && (columnTypingWithoutFirstRecord.contains(Type.INTEGER) || columnTypingWithoutFirstRecord.contains(Type.DOUBLE) || columnTypingWithoutFirstRecord.contains(Type.BOOLEAN))) {
                firstLineAHeader = true;
                headerInfoReliable = true;
            }
        }
    } else {
        firstLineAHeader = false;
    }
    // type analysis: if there is a header the first line is excluded from type analysis, otherwise it is
    // included
    headers = new ArrayList<>();
    if (firstLineAHeader) {
        List<Type> columnTypes = columnTypingWithoutFirstRecord();
        List<String> firstLine = readLine(sampleLines.get(0));
        int i = 0;
        for (String field : firstLine) {
            headers.add(new Pair<>(field, columnTypes.get(i++)));
        }
    } else {
        List<Type> columnTypes = allRecordsColumnTyping();
        int i = 1;
        for (Type type : columnTypes) {
            headers.add(new Pair<>(message("import.local.generated_column_name", i++), type));
        }
    }
    analysisPerformed = true;
}
Also used : Type(org.talend.dataprep.api.type.Type)

Example 18 with Type

use of org.talend.dataprep.api.type.Type in project data-prep by Talend.

the class StatisticsAdapter method injectDataTypeAnalysis.

private void injectDataTypeAnalysis(final ColumnMetadata column, final Analyzers.Result result) {
    if (result.exist(DataTypeOccurences.class) && !column.isTypeForced()) {
        final DataTypeOccurences dataType = result.get(DataTypeOccurences.class);
        final DataTypeEnum suggestedEnumType = dataType.getSuggestedType();
        final Type suggestedColumnType = Type.get(suggestedEnumType.name());
        // the suggested type can be modified by #injectValueQuality
        column.setType(suggestedColumnType.getName());
    }
    injectValueQuality(column, result);
}
Also used : SemanticType(org.talend.dataquality.semantic.statistics.SemanticType) Type(org.talend.dataprep.api.type.Type) DataTypeEnum(org.talend.dataquality.statistics.type.DataTypeEnum) DataTypeOccurences(org.talend.dataquality.statistics.type.DataTypeOccurences)

Example 19 with Type

use of org.talend.dataprep.api.type.Type in project data-prep by Talend.

the class XlsSchemaParser method parsePerSheet.

/**
 * Return the columns metadata for the given sheet.
 *
 * @param sheet the sheet to look at.
 * @param datasetId the dataset id.
 * @return the columns metadata for the given sheet.
 */
private List<ColumnMetadata> parsePerSheet(Sheet sheet, String datasetId, FormulaEvaluator formulaEvaluator) {
    LOGGER.debug(Markers.dataset(datasetId), "parsing sheet '{}'", sheet.getSheetName());
    // Map<ColId, Map<RowId, type>>
    SortedMap<Integer, SortedMap<Integer, String>> cellsTypeMatrix = collectSheetTypeMatrix(sheet, formulaEvaluator);
    int averageHeaderSize = guessHeaderSize(cellsTypeMatrix);
    // here we have information regarding types for each rows/col (yup a Matrix!! :-) )
    // so we can analyse and guess metadata (column type, header value)
    final List<ColumnMetadata> columnsMetadata = new ArrayList<>(cellsTypeMatrix.size());
    cellsTypeMatrix.forEach((colId, typePerRowMap) -> {
        Type type = guessColumnType(colId, typePerRowMap, averageHeaderSize);
        String headerText = null;
        if (averageHeaderSize == 1 && sheet.getRow(0) != null) {
            // so header value is the first row of the column
            Cell headerCell = sheet.getRow(0).getCell(colId);
            headerText = XlsUtils.getCellValueAsString(headerCell, formulaEvaluator);
        }
        // header text cannot be null so use a default one
        if (StringUtils.isEmpty(headerText)) {
            // +1 because it starts from 0
            headerText = message("import.local.generated_column_name", colId + 1);
        }
        // FIXME what do we do if header size is > 1 concat all lines?
        columnsMetadata.add(// 
        ColumnMetadata.Builder.column().headerSize(// 
        averageHeaderSize).name(// 
        headerText).type(// 
        type).build());
    });
    return columnsMetadata;
}
Also used : ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) Type(org.talend.dataprep.api.type.Type)

Example 20 with Type

use of org.talend.dataprep.api.type.Type in project data-prep by Talend.

the class ContentAnalysisTest method createMetadata.

private DataSetMetadata createMetadata(String id, List<String> header) {
    final DataSetMetadata metadata = metadataBuilder.metadata().id(id).build();
    List<ColumnMetadata> columns = header.stream().map(s -> ColumnMetadata.Builder.column().name(s).type(Type.STRING).build()).collect(Collectors.toList());
    metadata.setRowMetadata(new RowMetadata(columns));
    return metadata;
}
Also used : CoreMatchers(org.hamcrest.CoreMatchers) Arrays(java.util.Arrays) LocalStoreLocation(org.talend.dataprep.api.dataset.location.LocalStoreLocation) Autowired(org.springframework.beans.factory.annotation.Autowired) ReflectionTestUtils(org.springframework.test.util.ReflectionTestUtils) Test(org.junit.Test) UUID(java.util.UUID) Collectors(java.util.stream.Collectors) List(java.util.List) Type(org.talend.dataprep.api.type.Type) DataSetBaseTest(org.talend.dataprep.dataset.DataSetBaseTest) CSVFormatFamily(org.talend.dataprep.schema.csv.CSVFormatFamily) Optional(java.util.Optional) DataSetMetadata(org.talend.dataprep.api.dataset.DataSetMetadata) Assert(org.junit.Assert) ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) RowMetadata(org.talend.dataprep.api.dataset.RowMetadata) ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) RowMetadata(org.talend.dataprep.api.dataset.RowMetadata) DataSetMetadata(org.talend.dataprep.api.dataset.DataSetMetadata)

Aggregations

Type (org.talend.dataprep.api.type.Type)24 ColumnMetadata (org.talend.dataprep.api.dataset.ColumnMetadata)21 Test (org.junit.Test)17 DataSetMetadata (org.talend.dataprep.api.dataset.DataSetMetadata)14 DataSetBaseTest (org.talend.dataprep.dataset.DataSetBaseTest)13 DataSetServiceTest (org.talend.dataprep.dataset.service.DataSetServiceTest)12 Arrays (java.util.Arrays)4 List (java.util.List)4 Optional (java.util.Optional)3 StringUtils (org.apache.commons.lang.StringUtils)3 Assert.assertEquals (org.junit.Assert.assertEquals)3 Builder.column (org.talend.dataprep.api.dataset.ColumnMetadata.Builder.column)3 RowMetadata (org.talend.dataprep.api.dataset.RowMetadata)3 PatternFrequency (org.talend.dataprep.api.dataset.statistics.PatternFrequency)3 IOException (java.io.IOException)2 Collections (java.util.Collections)2 HashMap (java.util.HashMap)2 Locale (java.util.Locale)2 Map (java.util.Map)2 Collectors (java.util.stream.Collectors)2