Example use of org.talend.dataquality.record.linkage.analyzer.StringsClusterAnalyzer in the data-prep project by Talend.
From the class ClusterParameters, method getParameters.
/**
 * Builds the "cluster" parameter for a column from the text clusters found in its values.
 *
 * <p>Every record's value for {@code columnId} is fed to a {@link StringsClusterAnalyzer}
 * configured with a SOUNDEX post-merge (0.8f, presumably the match threshold). Each resulting
 * cluster with a non-empty survived value becomes one cluster item: a BOOLEAN constant
 * parameter per original value, plus a STRING "replaceValue" parameter whose default is the
 * survived value.
 *
 * @param columnId id of the column whose values are clustered
 * @param content data set providing the records to analyze
 * @return a {@link GenericParameter} named "cluster" wrapping the built clusters
 */
@Override
public GenericParameter getParameters(final String columnId, final DataSet content) {
    // Run the clustering analysis over all values of the column
    final StringsClusterAnalyzer analyzer = new StringsClusterAnalyzer();
    analyzer.withPostMerges(new PostMerge(AttributeMatcherType.SOUNDEX, 0.8f));
    analyzer.init();
    content.getRecords().forEach(record -> analyzer.analyze(record.get(columnId)));
    analyzer.end();

    // Convert the analysis result into cluster parameters
    final Clusters.Builder clusters = Clusters.builder()
            .title(DataprepBundle.message("parameter.textclustering.title.1"))
            .title(DataprepBundle.message("parameter.textclustering.title.2"));

    // Only the first result of the analyzer is used
    for (StringClusters.StringCluster found : analyzer.getResult().get(0)) {
        // Clusters of null / empty values may be produced, but are of no interest for data prep
        if (StringUtils.isEmpty(found.survivedValue)) {
            continue;
        }

        final ClusterItem.Builder item = ClusterItem.builder();
        for (String original : found.originalValues) {
            item.parameter(new ConstantParameter(original, ParameterType.BOOLEAN));
        }
        // The survived value is offered as the default replacement for the whole cluster
        item.replace(
                Parameter.parameter(LocaleContextHolder.getLocale())
                        .setName("replaceValue")
                        .setType(ParameterType.STRING)
                        .setDefaultValue(found.survivedValue)
                        .build(null));
        clusters.cluster(item);
    }

    return new GenericParameter("cluster", clusters.build());
}
Aggregations