Search in sources :

Example 6 with ColumnMetadata

use of org.talend.dataprep.api.dataset.ColumnMetadata in project data-prep by Talend.

Method createNewColumnsImpl of the class ActionsUtils.

/**
 * Builds one new column per entry of {@code additionalColumns} and inserts each of them into
 * {@code rowMetadata}. The first new column is inserted after {@code columnId}; every following
 * one is inserted after the column created just before it.
 *
 * @param context the action context; its row metadata is used to look up the column whose metadata is copied
 * @param additionalColumns the descriptions of the columns to create, processed in iteration order
 * @param columnId the id of the column after which the first new column is inserted
 * @param rowMetadata the row metadata that receives the new columns
 * @return a map associating the key of each additional column with the id of the column created for it
 */
private static Map<String, String> createNewColumnsImpl(ActionContext context, List<AdditionalColumn> additionalColumns, String columnId, RowMetadata rowMetadata) {
    final Map<String, String> createdIdsByKey = new HashMap<>();
    // Insertion anchor: starts at the current column, then moves to each freshly created column.
    String anchorId = columnId;
    for (AdditionalColumn description : additionalColumns) {
        final ColumnMetadata.Builder builder = ColumnMetadata.Builder.column();
        final String templateId = description.getCopyMetadataFromId();
        if (templateId == null) {
            // No template column: only the declared type is applied.
            builder.type(description.getType());
        } else {
            // Copying the original column metadata (and its type) matters for actions that rely on statistics.
            final ColumnMetadata template = context.getRowMetadata().getById(templateId);
            builder.copy(template).computedId(StringUtils.EMPTY);
            builder.type(Type.get(template.getType()));
        }
        builder.name(description.getName());
        final ColumnMetadata created = builder.build();
        rowMetadata.insertAfter(anchorId, created);
        final String createdId = created.getId();
        createdIdsByKey.put(description.getKey(), createdId);
        // The next column, if any, goes right after the one just inserted.
        anchorId = createdId;
    }
    return createdIdsByKey;
}
Also used : ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) HashMap(java.util.HashMap)

Example 7 with ColumnMetadata

use of org.talend.dataprep.api.dataset.ColumnMetadata in project data-prep by Talend.

Method toClear of the class ClearMatching.

/**
 * Tells whether the value of the given row in column {@code columnId} matches the value configured
 * under {@code VALUE_PARAMETER} in the action parameters.
 *
 * @param dataSetRow the row whose value is tested
 * @param columnId the id of the column to read the value from
 * @param actionContext the context providing the action parameters and the row metadata
 * @return {@code true} when the row value matches the configured value
 */
@Override
public boolean toClear(DataSetRow dataSetRow, String columnId, ActionContext actionContext) {
    final ColumnMetadata column = actionContext.getRowMetadata().getById(columnId);
    final String cellValue = dataSetRow.get(columnId);
    final String expectedValue = actionContext.getParameters().get(VALUE_PARAMETER);

    final boolean booleanColumn = Type.BOOLEAN == Type.get(column.getType());
    if (booleanColumn) {
        // Boolean columns are compared case-insensitively, so "True" matches "true".
        return StringUtils.equalsIgnoreCase(cellValue, expectedValue);
    }
    // Any other type delegates the comparison to the replace-on-value helper.
    return new ReplaceOnValueHelper().build(expectedValue, true).matches(cellValue);
}
Also used : ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) ReplaceOnValueHelper(org.talend.dataprep.transformation.actions.common.ReplaceOnValueHelper) RowMetadata(org.talend.dataprep.api.dataset.RowMetadata)

Example 8 with ColumnMetadata

use of org.talend.dataprep.api.dataset.ColumnMetadata in project data-prep by Talend.

Method getAdditionalColumns of the class Concat.

/**
 * Describes the single column created by this action, naming it from the action parameters.
 * <p>
 * When the mode parameter equals {@code OTHER_COLUMN_MODE}, the name is the source column name,
 * the column names separator and the selected column name. Otherwise it is the source column name
 * wrapped with the configured prefix and suffix (both default to the empty string).
 *
 * @param context the action context providing the parameters, the row metadata and the source column name
 * @return a one-element list holding the description of the new column
 */
protected List<ActionsUtils.AdditionalColumn> getAdditionalColumns(ActionContext context) {
    final Map<String, String> parameters = context.getParameters();
    final ColumnMetadata selectedColumn = context.getRowMetadata().getById(parameters.get(SELECTED_COLUMN_PARAMETER));
    final String sourceColumnName = context.getColumnName();
    final String prefix = getParameter(parameters, PREFIX_PARAMETER, StringUtils.EMPTY);
    final String suffix = getParameter(parameters, SUFFIX_PARAMETER, StringUtils.EMPTY);

    final boolean otherColumnMode = parameters.get(MODE_PARAMETER).equals(OTHER_COLUMN_MODE);
    final String newColumnName = otherColumnMode
            ? sourceColumnName + COLUMN_NAMES_SEPARATOR + selectedColumn.getName()
            : prefix + sourceColumnName + suffix;
    return singletonList(ActionsUtils.additionalColumn().withName(newColumnName));
}
Also used : ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata)

Example 9 with ColumnMetadata

use of org.talend.dataprep.api.dataset.ColumnMetadata in project data-prep by Talend.

Method setRemainingRowColumnsNames of the class MakeLineHeader.

/**
 * Renames the columns of the context row metadata with the names stored in the context under each
 * column id. Iteration stops at the first column for which the context holds no entry.
 *
 * @param context the action context holding both the row metadata and the new column names
 */
private void setRemainingRowColumnsNames(ActionContext context) {
    for (ColumnMetadata column : context.getRowMetadata().getColumns()) {
        final String columnId = column.getId();
        if (context.has(columnId)) {
            final String header = context.get(columnId);
            column.setName(header);
        } else {
            // The action has not found the new headers yet: nothing more to rename.
            break;
        }
    }
}
Also used : ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata)

Example 10 with ColumnMetadata

use of org.talend.dataprep.api.dataset.ColumnMetadata in project data-prep by Talend.

Method injectSemanticTypes of the class StatisticsAdapter.

/**
 * Applies the semantic type information of an analysis result to a column.
 * <p>
 * Nothing happens when the result carries no {@link SemanticType} or when the column domain is forced.
 * Otherwise the first suggested category with a non-empty name is used to update the column
 * (if its score is above {@code semanticThreshold}) or the column domain is reset. In all cases the
 * suggested categories are then stored on the column as semantic domains.
 *
 * @param column the column metadata to update
 * @param result the analysis result to read the semantic type from
 */
private void injectSemanticTypes(final ColumnMetadata column, final Analyzers.Result result) {
    if (!result.exist(SemanticType.class) || column.isDomainForced()) {
        return;
    }
    final List<CategoryFrequency> candidates = result.get(SemanticType.class).getSuggestedCategories();

    // TDP-471: Don't pick semantic type if lower than a threshold.
    CategoryFrequency topCandidate = null;
    for (CategoryFrequency candidate : candidates) {
        if (!candidate.getCategoryName().isEmpty()) {
            topCandidate = candidate;
            break;
        }
    }

    if (topCandidate != null) {
        // TODO (TDP-734) Take into account limit of the semantic analyzer.
        final float score = topCandidate.getScore();
        if (score > semanticThreshold) {
            updateMetadataWithCategoryInfo(column, topCandidate);
        } else {
            // Ensure the domain is cleared if the score is lower than the threshold: an earlier analysis
            // (e.g. on the first 20 lines) may be over the threshold, but a full scan may decide otherwise.
            resetDomain(column);
        }
    } else if (StringUtils.isNotEmpty(column.getDomain())) {
        // Column *had* a domain but the new analysis no longer suggests one.
        resetDomain(column);
    }

    // Keep the suggested semantic categories (at most 10, score of at least 1) in the column metadata.
    final List<SemanticDomain> keptDomains = candidates.stream()
            .map(this::toSemanticDomain)
            .filter(domain -> domain != null && domain.getScore() >= 1)
            .limit(10)
            .collect(Collectors.toList());
    column.setSemanticDomains(keptDomains);
}
Also used : Analyzers(org.talend.dataquality.common.inference.Analyzers) java.util(java.util) StringUtils(org.apache.commons.lang.StringUtils) DateHistogram(org.talend.dataprep.api.dataset.statistics.date.DateHistogram) CardinalityStatistics(org.talend.dataquality.statistics.cardinality.CardinalityStatistics) TypeUtils(org.talend.dataprep.api.type.TypeUtils) DataTypeFrequencyStatistics(org.talend.dataquality.statistics.frequency.DataTypeFrequencyStatistics) LoggerFactory(org.slf4j.LoggerFactory) Quality(org.talend.dataprep.api.dataset.Quality) StreamNumberHistogramStatistics(org.talend.dataprep.api.dataset.statistics.number.StreamNumberHistogramStatistics) NumberFormat(java.text.NumberFormat) org.talend.dataprep.api.dataset.statistics(org.talend.dataprep.api.dataset.statistics) DataTypeEnum(org.talend.dataquality.statistics.type.DataTypeEnum) ValueQualityStatistics(org.talend.dataquality.common.inference.ValueQualityStatistics) SummaryStatistics(org.talend.dataquality.statistics.numeric.summary.SummaryStatistics) CategoryFrequency(org.talend.dataquality.semantic.recognizer.CategoryFrequency) SemanticType(org.talend.dataquality.semantic.statistics.SemanticType) NumberHistogram(org.talend.dataprep.api.dataset.statistics.number.NumberHistogram) Logger(org.slf4j.Logger) Predicate(java.util.function.Predicate) DQCategory(org.talend.dataquality.semantic.model.DQCategory) DecimalFormat(java.text.DecimalFormat) StreamDateHistogramStatistics(org.talend.dataprep.api.dataset.statistics.date.StreamDateHistogramStatistics) TextLengthStatistics(org.talend.dataquality.statistics.text.TextLengthStatistics) Collectors(java.util.stream.Collectors) PatternFrequencyStatistics(org.talend.dataquality.statistics.frequency.pattern.PatternFrequencyStatistics) Type(org.talend.dataprep.api.type.Type) CategoryRegistryManager(org.talend.dataquality.semantic.api.CategoryRegistryManager) QuantileStatistics(org.talend.dataquality.statistics.numeric.quantile.QuantileStatistics) 
ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) DataTypeOccurences(org.talend.dataquality.statistics.type.DataTypeOccurences) SemanticType(org.talend.dataquality.semantic.statistics.SemanticType) CategoryFrequency(org.talend.dataquality.semantic.recognizer.CategoryFrequency)

Aggregations

ColumnMetadata (org.talend.dataprep.api.dataset.ColumnMetadata)320 Test (org.junit.Test)217 AbstractMetadataBaseTest (org.talend.dataprep.transformation.actions.AbstractMetadataBaseTest)115 RowMetadata (org.talend.dataprep.api.dataset.RowMetadata)86 DataSetRow (org.talend.dataprep.api.dataset.row.DataSetRow)80 DataSetMetadata (org.talend.dataprep.api.dataset.DataSetMetadata)48 InputStream (java.io.InputStream)25 Type (org.talend.dataprep.api.type.Type)23 DataSetBaseTest (org.talend.dataprep.dataset.DataSetBaseTest)22 ArrayList (java.util.ArrayList)19 HashMap (java.util.HashMap)17 IOException (java.io.IOException)14 TDPException (org.talend.dataprep.exception.TDPException)14 Schema (org.talend.dataprep.schema.Schema)14 Autowired (org.springframework.beans.factory.annotation.Autowired)13 Logger (org.slf4j.Logger)12 LoggerFactory (org.slf4j.LoggerFactory)12 SemanticDomain (org.talend.dataprep.api.dataset.statistics.SemanticDomain)12 DataSetServiceTest (org.talend.dataprep.dataset.service.DataSetServiceTest)11 ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)10