use of org.talend.dataquality.record.linkage.grouping.swoosh.SurvivorShipAlgorithmParams in project tdq-studio-se by Talend.
the class AnalysisRecordGroupingUtilsTest method testCreateSurvivorShipAlgorithmParams.
/**
 * Test method for
 * {@link org.talend.dq.analysis.AnalysisRecordGroupingUtils#createSurvivorShipAlgorithmParams(org.talend.dataquality.record.linkage.grouping.AnalysisMatchRecordGrouping, org.talend.dataquality.indicators.columnset.RecordMatchingIndicator, Map)}.
 * <p>
 * The fixture declares two survivorship keys ("a1" and "a2"), one default survivorship definition and one
 * particular default survivorship definition, three columns ("a1".."a3") and a single match rule with one
 * EXACT key and two DUMMY keys. The assertions then check the sizes of the generated structures and the
 * algorithm selected for each of the three match keys.
 */
@Test
public void testCreateSurvivorShipAlgorithmParams() {
    // Indicator carrying the built-in match rule definition under test.
    RecordMatchingIndicator indicator = ColumnsetFactory.eINSTANCE.createRecordMatchingIndicator();
    MatchRuleDefinition ruleDefinition = RulesFactory.eINSTANCE.createMatchRuleDefinition();
    indicator.setBuiltInMatchRuleDefinition(ruleDefinition);

    // Survivorship keys: "a1" and "a2", both with the "Longest" algorithm type.
    EList<SurvivorshipKeyDefinition> keyDefinitions = ruleDefinition.getSurvivorshipKeys();
    AlgorithmDefinition longestForA1 = RulesFactory.eINSTANCE.createAlgorithmDefinition();
    longestForA1.setAlgorithmParameters(""); // $NON-NLS-1$
    longestForA1.setAlgorithmType("Longest"); // $NON-NLS-1$
    keyDefinitions.add(createKeyDefinition("a1", longestForA1)); // $NON-NLS-1$
    AlgorithmDefinition longestForA2 = RulesFactory.eINSTANCE.createAlgorithmDefinition();
    longestForA2.setAlgorithmParameters(""); // $NON-NLS-1$
    longestForA2.setAlgorithmType("Longest"); // $NON-NLS-1$
    keyDefinitions.add(createKeyDefinition("a2", longestForA2)); // $NON-NLS-1$

    // Default survivorship definition with the "MostCommon" algorithm type.
    EList<DefaultSurvivorshipDefinition> defaultDefinitions = ruleDefinition.getDefaultSurvivorshipDefinitions();
    AlgorithmDefinition mostCommon = RulesFactory.eINSTANCE.createAlgorithmDefinition();
    mostCommon.setAlgorithmParameters(""); // $NON-NLS-1$
    mostCommon.setAlgorithmType("MostCommon"); // $NON-NLS-1$
    defaultDefinitions.add(createDefaultsurvivShip("String", mostCommon)); // $NON-NLS-1$

    // Particular default survivorship definition with the "Concatenate" algorithm type.
    EList<ParticularDefaultSurvivorshipDefinitions> particularDefinitions = ruleDefinition.getParticularDefaultSurvivorshipDefinitions();
    AlgorithmDefinition concatenate = RulesFactory.eINSTANCE.createAlgorithmDefinition();
    concatenate.setAlgorithmParameters(""); // $NON-NLS-1$
    concatenate.setAlgorithmType("Concatenate"); // $NON-NLS-1$
    particularDefinitions.add(createParticularDefaultSurvivorshipDefinitions("a2", concatenate)); // $NON-NLS-1$

    // Column map: three columns mapped to the indexes "0", "1" and "2".
    Map<MetadataColumn, String> columnMap = new HashMap<MetadataColumn, String>();
    MetadataColumn firstColumn = ConnectionFactory.eINSTANCE.createMetadataColumn();
    firstColumn.setName("a1"); // $NON-NLS-1$
    firstColumn.setTalendType("id_String"); // $NON-NLS-1$
    columnMap.put(firstColumn, "0"); // $NON-NLS-1$
    MetadataColumn secondColumn = ConnectionFactory.eINSTANCE.createMetadataColumn();
    secondColumn.setTalendType("id_String"); // $NON-NLS-1$
    secondColumn.setName("a2"); // $NON-NLS-1$
    columnMap.put(secondColumn, "1"); // $NON-NLS-1$
    MetadataColumn thirdColumn = ConnectionFactory.eINSTANCE.createMetadataColumn();
    thirdColumn.setTalendType("id_String"); // $NON-NLS-1$
    thirdColumn.setName("a3"); // $NON-NLS-1$
    columnMap.put(thirdColumn, "2"); // $NON-NLS-1$

    // Record grouping with a consumer that ignores every row, plus one registered matcher.
    MatchGroupResultConsumer ignoringConsumer = new MatchGroupResultConsumer(true) {

        @Override
        public void handle(Object row) {
            // no need to implement
        }
    };
    AnalysisMatchRecordGrouping grouping = new AnalysisMatchRecordGrouping(ignoringConsumer);
    CombinedRecordMatcher combinedMatcher = grouping.getCombinedRecordMatcher();
    DQMFBRecordMatcher registeredMatcher = new DQMFBRecordMatcher(0.9);
    combinedMatcher.getMatchers().add(registeredMatcher);

    // Single match rule made of three match keys: "a1" is EXACT, "a2" and "a3" are DUMMY.
    List<List<Map<String, String>>> multiMatchRules = grouping.getMultiMatchRules();
    Map<String, String> keyForA1 = new HashMap<String, String>();
    keyForA1.put(IRecordGrouping.MATCHING_TYPE, AttributeMatcherType.EXACT.name());
    keyForA1.put(IRecordGrouping.MATCH_KEY_NAME, "a1"); // $NON-NLS-1$
    keyForA1.put(IRecordGrouping.COLUMN_IDX, "0"); // $NON-NLS-1$
    // The assertions below expect CONCATENATE for this DUMMY key.
    Map<String, String> keyForA2 = new HashMap<String, String>();
    keyForA2.put(IRecordGrouping.MATCHING_TYPE, AttributeMatcherType.DUMMY.name());
    keyForA2.put(IRecordGrouping.MATCH_KEY_NAME, "a2"); // $NON-NLS-1$
    keyForA2.put(IRecordGrouping.COLUMN_IDX, "1"); // $NON-NLS-1$
    // The assertions below expect MOST_COMMON for this DUMMY key.
    Map<String, String> keyForA3 = new HashMap<String, String>();
    keyForA3.put(IRecordGrouping.MATCHING_TYPE, AttributeMatcherType.DUMMY.name());
    keyForA3.put(IRecordGrouping.MATCH_KEY_NAME, "a3"); // $NON-NLS-1$
    keyForA3.put(IRecordGrouping.COLUMN_IDX, "2"); // $NON-NLS-1$
    List<Map<String, String>> matchRule = new ArrayList<Map<String, String>>();
    matchRule.add(keyForA1);
    matchRule.add(keyForA2);
    matchRule.add(keyForA3);
    multiMatchRules.add(matchRule);

    SurvivorShipAlgorithmParams params = AnalysisRecordGroupingUtils.createSurvivorShipAlgorithmParams(grouping, indicator, columnMap);

    Assert.assertEquals("The size of SurvivorShipAlgos should be 2", // $NON-NLS-1$
            2, params.getSurviorShipAlgos().length);
    // Per the original note: default rule count = column count * default definition count
    // (three columns and one default definition in this fixture).
    Assert.assertEquals("The size of DefaultSurviorshipRules should be 3", // $NON-NLS-1$
            3, params.getDefaultSurviorshipRules().size());

    Map<IRecordMatcher, SurvivorshipFunction[]> algosByMatcher = params.getSurvivorshipAlgosMap();
    Assert.assertEquals("The size of survivorshipAlgosMap should be 1", 1, algosByMatcher.size()); // $NON-NLS-1$

    SurvivorshipFunction[] functions = algosByMatcher.get(registeredMatcher);
    Assert.assertEquals("The size of survivorshipFunctions should be 3", 3, functions.length); // $NON-NLS-1$
    Assert.assertEquals("The Algorithm of a1 function should be LONGEST", // $NON-NLS-1$
            SurvivorShipAlgorithmEnum.LONGEST, functions[0].getSurvivorShipAlgoEnum());
    Assert.assertEquals("The Algorithm of a2 function should be Concatenate", // $NON-NLS-1$
            SurvivorShipAlgorithmEnum.CONCATENATE, functions[1].getSurvivorShipAlgoEnum());
    Assert.assertEquals("The Algorithm of a3 function should be MostCommon", // $NON-NLS-1$
            SurvivorShipAlgorithmEnum.MOST_COMMON, functions[2].getSurvivorShipAlgoEnum());
}
use of org.talend.dataquality.record.linkage.grouping.swoosh.SurvivorShipAlgorithmParams in project tdq-studio-se by Talend.
the class AnalysisMatchParameterAdapter method getAllSurvivorshipFunctions.
/*
 * (non-Javadoc)
 *
 * Builds one survivorship function per survivorship key definition found in the built-in match rule
 * definition of the record matching indicator, copying the key name, the algorithm parameters and the
 * algorithm type (resolved through SurvivorShipAlgorithmEnum.getTypeBySavedValue).
 *
 * @see org.talend.dataquality.record.linkage.grouping.adapter.MatchParameterAdapter#getAllSurvivorshipFunctions()
 */
@Override
public List<SurvivorshipFunction> getAllSurvivorshipFunctions() {
    List<SurvivorshipFunction> collected = new ArrayList<SurvivorshipFunction>();
    List<SurvivorshipKeyDefinition> keyDefinitions = recordMatchingIndicator.getBuiltInMatchRuleDefinition().getSurvivorshipKeys();
    for (SurvivorshipKeyDefinition keyDefinition : keyDefinitions) {
        // SurvivorshipFunction is an inner class, so it needs an enclosing params instance.
        SurvivorShipAlgorithmParams owner = new SurvivorShipAlgorithmParams();
        SurvivorshipFunction function = owner.new SurvivorshipFunction();
        function.setSurvivorShipKey(keyDefinition.getName());
        function.setParameter(keyDefinition.getFunction().getAlgorithmParameters());
        String savedAlgorithmType = keyDefinition.getFunction().getAlgorithmType();
        function.setSurvivorShipAlgoEnum(SurvivorShipAlgorithmEnum.getTypeBySavedValue(savedAlgorithmType));
        collected.add(function);
    }
    return collected;
}
use of org.talend.dataquality.record.linkage.grouping.swoosh.SurvivorShipAlgorithmParams in project tdq-studio-se by Talend.
the class AnalysisRecordGroupingUtils method initialMatchGrouping.
/**
 * Configures and initializes the given record grouping according to the record linkage algorithm stored in
 * the built-in match rule definition of the indicator.
 * <p>
 * When the stored algorithm equals the name of {@code RecordMatcherType.simpleVSRMatcher}, the grouping is
 * set to that algorithm and initialized. Otherwise the grouping is set to
 * {@code RecordMatcherType.T_SwooshAlgorithm}, its original input column size is set to
 * {@code columnMap.size() + 1}, it is initialized, and survivorship algorithm parameters are created and
 * attached to it.
 *
 * @param columnMap the columns of the analysis mapped to a string value (the tests use column indexes)
 * @param recordMatchingIndicator the indicator whose built-in match rule definition is read
 * @param analysisMatchRecordGrouping the grouping to configure and initialize
 * @throws InstantiationException declared by this method; presumably raised during initialization
 * @throws IllegalAccessException declared by this method; presumably raised during initialization
 * @throws ClassNotFoundException declared by this method; presumably raised during initialization
 */
public static void initialMatchGrouping(Map<MetadataColumn, String> columnMap, RecordMatchingIndicator recordMatchingIndicator, AnalysisMatchRecordGrouping analysisMatchRecordGrouping) throws InstantiationException, IllegalAccessException, ClassNotFoundException {
    boolean simpleVsrConfigured = recordMatchingIndicator.getBuiltInMatchRuleDefinition().getRecordLinkageAlgorithm()
            .equals(RecordMatcherType.simpleVSRMatcher.name());
    if (simpleVsrConfigured) {
        analysisMatchRecordGrouping.setRecordLinkAlgorithm(RecordMatcherType.simpleVSRMatcher);
        analysisMatchRecordGrouping.initialize();
        return;
    }

    // Any other stored value is handled as T-Swoosh.
    analysisMatchRecordGrouping.setRecordLinkAlgorithm(RecordMatcherType.T_SwooshAlgorithm);
    // NOTE(review): the "+ 1" presumably accounts for one extra column beyond columnMap - confirm.
    int originalColumnCount = columnMap.size() + 1;
    analysisMatchRecordGrouping.setOrginalInputColumnSize(originalColumnCount);
    analysisMatchRecordGrouping.initialize();
    // The survivorship parameters are created only after the grouping has been initialized.
    analysisMatchRecordGrouping.setSurvivorShipAlgorithmParams(
            createSurvivorShipAlgorithmParams(analysisMatchRecordGrouping, recordMatchingIndicator, columnMap));
}
use of org.talend.dataquality.record.linkage.grouping.swoosh.SurvivorShipAlgorithmParams in project tdq-studio-se by Talend.
the class AnalysisMatchParameterAdapter method putNewSurvFunc.
/**
 * Creates a new survivorship function from a default survivorship definition and stores it in the given
 * map, keyed by the column index that {@code columnMap} holds for {@code metaColumn}.
 * <p>
 * The value found in {@code columnMap} is converted with {@link Integer#valueOf(String)}, so a missing or
 * non-numeric entry results in a {@link NumberFormatException}.
 *
 * @param columnMap maps each column to its index, expressed as a string
 * @param defaultSurvRules receives the new function under the integer column index
 * @param metaColumn the column whose index is used as the key
 * @param defSurvDef the definition providing the algorithm parameters and the algorithm type
 */
private static void putNewSurvFunc(Map<MetadataColumn, String> columnMap, Map<Integer, SurvivorshipFunction> defaultSurvRules, MetadataColumn metaColumn, DefaultSurvivorshipDefinition defSurvDef) {
    // SurvivorshipFunction is an inner class, so it needs an enclosing params instance.
    SurvivorShipAlgorithmParams owner = new SurvivorShipAlgorithmParams();
    SurvivorshipFunction defaultFunction = owner.new SurvivorshipFunction();
    defaultFunction.setParameter(defSurvDef.getFunction().getAlgorithmParameters());
    String savedAlgorithmType = defSurvDef.getFunction().getAlgorithmType();
    defaultFunction.setSurvivorShipAlgoEnum(SurvivorShipAlgorithmEnum.getTypeBySavedValue(savedAlgorithmType));

    Integer columnIndex = Integer.valueOf(columnMap.get(metaColumn));
    defaultSurvRules.put(columnIndex, defaultFunction);
}
use of org.talend.dataquality.record.linkage.grouping.swoosh.SurvivorShipAlgorithmParams in project tdq-studio-se by Talend.
the class AnalysisMatchParameterAdapter method getSurvivorshipAlgosMap.
/*
 * (non-Javadoc)
 *
 * Builds, for every non-null match rule of the record grouping, an array with one survivorship function
 * per match key and associates it with the record matcher found at the same index as the rule.
 *
 * For a DUMMY match key the function is looked up in colIdx2DefaultSurvFunc by column index; when none is
 * registered a MOST_COMMON function is created. For any other match key the function is the first entry of
 * survFunctions whose survivorship key equals the match key name; the array slot stays null when no entry
 * matches.
 *
 * @see org.talend.dataquality.record.linkage.grouping.adapter.MatchParameterAdapter#getSurvivorshipAlgosMap(java.util.Map)
 */
@Override
public Map<IRecordMatcher, SurvivorshipFunction[]> getSurvivorshipAlgosMap(Map<Integer, SurvivorshipFunction> colIdx2DefaultSurvFunc, List<SurvivorshipFunction> survFunctions) {
    Map<IRecordMatcher, SurvivorshipFunction[]> survAlgos = new HashMap<IRecordMatcher, SurvivorshipFunction[]>();
    // Starts at -1 because it is incremented before the null check, so skipped rules still consume an index.
    int matchRuleIdx = -1;
    List<List<Map<String, String>>> multiRules = analysisMatchRecordGrouping.getMultiMatchRules();
    for (List<Map<String, String>> matchrule : multiRules) {
        matchRuleIdx++;
        if (matchrule == null) {
            continue;
        }
        SurvivorshipFunction[] surFuncsInMatcher = new SurvivorshipFunction[matchrule.size()];
        int idx = 0;
        for (Map<String, String> mkDef : matchrule) {
            String matcherType = mkDef.get(IRecordGrouping.MATCHING_TYPE);
            if (AttributeMatcherType.DUMMY.name().equalsIgnoreCase(matcherType)) {
                // Find the func from default survivorship rule.
                surFuncsInMatcher[idx] = colIdx2DefaultSurvFunc.get(Integer.valueOf(mkDef.get(IRecordGrouping.COLUMN_IDX)));
                if (surFuncsInMatcher[idx] == null) {
                    // Use MOST_COMMON by default if not specified.
                    // NOTE(review): the previous comment said CONCATENATE and the parameter below is the
                    // default CONCATENATE parameter, yet the algorithm set here is MOST_COMMON - confirm
                    // which one is intended before changing either.
                    surFuncsInMatcher[idx] = new SurvivorShipAlgorithmParams().new SurvivorshipFunction();
                    surFuncsInMatcher[idx].setSurvivorShipAlgoEnum(SurvivorShipAlgorithmEnum.MOST_COMMON);
                    // MOD TDQ-11774 set a default parameter
                    surFuncsInMatcher[idx].setParameter(SurvivorshipUtils.DEFAULT_CONCATENATE_PARAMETER);
                }
            } else {
                // Find the func from existing survivorship rule list. The key name does not depend on the
                // candidate function, so it is read once instead of once per candidate.
                String keyName = mkDef.get(IRecordGrouping.MATCH_KEY_NAME);
                for (SurvivorshipFunction survFunc : survFunctions) {
                    if (keyName.equals(survFunc.getSurvivorShipKey())) {
                        surFuncsInMatcher[idx] = survFunc;
                        break;
                    }
                }
            }
            idx++;
        }
        // Add the funcs to a specific record matcher. NOTE that the index of the matcher must coincide with
        // the index of the match rule.
        survAlgos.put(this.getCombinedRecordMatcher().getMatchers().get(matchRuleIdx), surFuncsInMatcher);
    }
    return survAlgos;
}
Aggregations