use of org.talend.dataquality.indicators.columnset.RecordMatchingIndicator in project tdq-studio-se by Talend.
the class ExecuteMatchRuleHandlerTest method testExecute1.
/**
 * Test method for
 * {@link org.talend.dq.analysis.ExecuteMatchRuleHandler#execute(java.util.Map, org.talend.dataquality.indicators.columnset.RecordMatchingIndicator, java.util.List, org.talend.dataquality.indicators.columnset.BlockKeyIndicator)}
 * .
 *
 * Scenario: no block key, one match rule holding a single EXACT match key on the first column.
 * The four input rows all carry a different value in that column, so the test expects four
 * result rows, each of them flagged as a master record.
 */
@Test
public void testExecute1() {
    // Map every analysed column to its positional index ("0".."3") in the input rows.
    Map<MetadataColumn, String> columnMap = new HashMap<MetadataColumn, String>();
    String[] columnNames = { columnName0, columnName1, columnName2, columnName3 };
    for (int index = 0; index < columnNames.length; index++) {
        MetadataColumn column = ConnectionFactory.eINSTANCE.createMetadataColumn();
        column.setName(columnNames[index]);
        columnMap.put(column, String.valueOf(index));
    }

    // Build the indicator with one match rule containing one match key on columnName0.
    RecordMatchingIndicator recordMatchingIndicator = ColumnsetFactory.eINSTANCE.createRecordMatchingIndicator();
    MatchRuleDefinition matchRuleDef = RulesPackage.eINSTANCE.getRulesFactory().createMatchRuleDefinition();
    recordMatchingIndicator.setBuiltInMatchRuleDefinition(matchRuleDef);

    MatchRule matchRule = RulesFactory.eINSTANCE.createMatchRule();
    MatchKeyDefinition matchKey = RulesFactory.eINSTANCE.createMatchKeyDefinition();
    matchRule.getMatchKeys().add(matchKey);
    matchKey.setColumn(columnName0);
    matchKey.setConfidenceWeight(1);
    matchKey.setName("rule1.matchkey1"); //$NON-NLS-1$
    matchKey.setHandleNull(HandleNullEnum.NULL_MATCH_NULL.getValue());

    AlgorithmDefinition algorithm = RulesFactory.eINSTANCE.createAlgorithmDefinition();
    algorithm.setAlgorithmType(AttributeMatcherType.EXACT.name());
    matchKey.setAlgorithm(algorithm);
    matchRuleDef.getMatchRules().add(matchRule);

    // Input data: the match-key column (index 0) is distinct in every row.
    List<Object[]> matchRows = new ArrayList<Object[]>();
    matchRows.add(new String[] { "id1", "name1", "number1", "date1" }); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
    matchRows.add(new String[] { "id2", "name2", "number2", "date2" }); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
    matchRows.add(new String[] { "id3", "name1", "number3", "date3" }); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
    matchRows.add(new String[] { "id4", "name4", "number2", "date1" }); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$

    BlockKeyIndicator blockKeyIndicator = ColumnsetFactory.eINSTANCE.createBlockKeyIndicator();
    ExecuteMatchRuleHandler execHandler = new ExecuteMatchRuleHandler();
    MatchGroupResultConsumer matchResultConsumer = createMatchGroupResultConsumer(columnMap, recordMatchingIndicator);
    TypedReturnCode<MatchGroupResultConsumer> executeResult = execHandler.execute(columnMap, recordMatchingIndicator,
            matchRows, blockKeyIndicator, matchResultConsumer);

    // Dedicated assertions report the offending value when they fail.
    Assert.assertTrue(executeResult.isOk());
    Assert.assertNull(executeResult.getMessage());
    MatchGroupResultConsumer resultConsumer = executeResult.getObject();
    Assert.assertNotNull(resultConsumer);

    List<Object[]> fullMatchResult = resultConsumer.getFullMatchResult();
    Assert.assertEquals(4, fullMatchResult.size());

    // Every input row is expected to be a master record.
    // NOTE(review): index 7 is taken from the original test as the position of the
    // master flag in a result row - confirm against the result row layout.
    final int masterFlagIndex = 7;
    for (Object[] resultRow : fullMatchResult) {
        Assert.assertTrue(Boolean.parseBoolean(resultRow[masterFlagIndex].toString()));
    }
}
use of org.talend.dataquality.indicators.columnset.RecordMatchingIndicator in project tdq-studio-se by Talend.
the class MatchAnalysisExecutorTest method assertScenario1.
/**
 * Builds a record matching indicator on the given column, runs the analysis and asserts the
 * group-size distribution and record counts expected for scenario 1.
 *
 * @param matchAnalysisExecutor executor handed to {@code executeAnalysis}
 * @param analysis analysis handed to {@code executeAnalysis}
 * @param name column set as the analyzed element of the indicator
 * @param nameVar value forwarded to {@code createMatchIndicatorWithOneMathRule}
 * @param groupQualityThreshold value forwarded to {@code createMatchIndicatorWithOneMathRule}
 * @param matchInterval value forwarded to {@code createMatchIndicatorWithOneMathRule}
 */
private void assertScenario1(MatchAnalysisExecutor matchAnalysisExecutor, Analysis analysis, MetadataColumn name, String nameVar, double groupQualityThreshold, double matchInterval) {
    // Set indicators into analysis result.
    RecordMatchingIndicator matchIndicator = ColumnsetPackage.eINSTANCE.getColumnsetFactory().createRecordMatchingIndicator();
    // Original author's note: match key is name, no block key, levenshtein attribute algorithm.
    matchIndicator.setAnalyzedElement(name);
    createMatchIndicatorWithOneMathRule(nameVar, matchIndicator, groupQualityThreshold, matchInterval);
    executeAnalysis(matchAnalysisExecutor, analysis, matchIndicator);

    // Assert group size and frequency.
    // Long.equals(...) is used instead of "get(...) == n" so that a missing group size
    // fails the assertion instead of throwing a NullPointerException while unboxing.
    Map<Object, Long> size2Frequency = matchIndicator.getGroupSize2groupFrequency();
    // For 4 -> "seb"
    assertTrue(Long.valueOf(1L).equals(size2Frequency.get(String.valueOf(4))));
    // For 1 -> "Sebastião","babass","nico","nicola"
    assertTrue(Long.valueOf(4L).equals(size2Frequency.get(String.valueOf(1))));
    // For 2 -> "sebas","nicolas","nigula"
    assertTrue(Long.valueOf(3L).equals(size2Frequency.get(String.valueOf(2))));

    // Assert row count, matched records and suspect records.
    assertTrue(matchIndicator.getCount() == 14);
    assertTrue(matchIndicator.getMatchedRecordCount() == 10);
    assertTrue(matchIndicator.getSuspectRecordCount() == 0);
}
use of org.talend.dataquality.indicators.columnset.RecordMatchingIndicator in project tdq-studio-se by Talend.
the class MatchAnalysisExecutorTest method assertScenario2.
/**
 * Builds a record matching indicator on the given column, runs the analysis and asserts the
 * group-size distribution and record counts expected for scenario 2.
 *
 * @param matchAnalysisExecutor executor handed to {@code executeAnalysis}
 * @param analysis analysis handed to {@code executeAnalysis}
 * @param name column set as the analyzed element of the indicator
 * @param nameVar value forwarded to {@code createMatchIndicatorWithOneMathRule}
 * @param groupQualityThreshold value forwarded to {@code createMatchIndicatorWithOneMathRule}
 * @param matchInterval value forwarded to {@code createMatchIndicatorWithOneMathRule}
 */
private void assertScenario2(MatchAnalysisExecutor matchAnalysisExecutor, Analysis analysis, MetadataColumn name, String nameVar, double groupQualityThreshold, double matchInterval) {
    // Set indicators into analysis result.
    RecordMatchingIndicator matchIndicator = ColumnsetPackage.eINSTANCE.getColumnsetFactory().createRecordMatchingIndicator();
    // Original author's note: match key is name, no block key, levenshtein attribute algorithm.
    matchIndicator.setAnalyzedElement(name);
    createMatchIndicatorWithOneMathRule(nameVar, matchIndicator, groupQualityThreshold, matchInterval);
    executeAnalysis(matchAnalysisExecutor, analysis, matchIndicator);

    // Assert group size and frequency.
    // Long.equals(...) is used instead of "get(...) == n" so that a missing group size
    // fails the assertion instead of throwing a NullPointerException while unboxing.
    Map<Object, Long> size2Frequency = matchIndicator.getGroupSize2groupFrequency();
    // For 4 -> "seb"
    assertTrue(Long.valueOf(1L).equals(size2Frequency.get(String.valueOf(4))));
    // For 1 -> "Sebastião","babass","nico"
    assertTrue(Long.valueOf(3L).equals(size2Frequency.get(String.valueOf(1))));
    // For 3 -> "nicolas"("nicola")
    assertTrue(Long.valueOf(1L).equals(size2Frequency.get(String.valueOf(3))));
    // For 2 -> "sebas","nigula"
    assertTrue(Long.valueOf(2L).equals(size2Frequency.get(String.valueOf(2))));

    // Assert row count, matched records and suspect records.
    assertTrue(matchIndicator.getCount() == 14);
    assertTrue(matchIndicator.getMatchedRecordCount() == 11);
    assertTrue(matchIndicator.getSuspectRecordCount() == 0);
}
use of org.talend.dataquality.indicators.columnset.RecordMatchingIndicator in project tdq-studio-se by Talend.
the class MatchAnalysisExecutorTest method assertScenario3.
/**
 * Builds a record matching indicator on the given column, runs the analysis and asserts the
 * group-size distribution and record counts expected for scenario 3, in which part of the
 * records is reported as suspect rather than matched.
 *
 * @param matchAnalysisExecutor executor handed to {@code executeAnalysis}
 * @param analysis analysis handed to {@code executeAnalysis}
 * @param name column set as the analyzed element of the indicator
 * @param nameVar value forwarded to {@code createMatchIndicatorWithOneMathRule}
 * @param groupQualityThreshold value forwarded to {@code createMatchIndicatorWithOneMathRule}
 * @param matchInterval value forwarded to {@code createMatchIndicatorWithOneMathRule}
 */
private void assertScenario3(MatchAnalysisExecutor matchAnalysisExecutor, Analysis analysis, MetadataColumn name, String nameVar, double groupQualityThreshold, double matchInterval) {
    // Set indicators into analysis result.
    RecordMatchingIndicator matchIndicator = ColumnsetPackage.eINSTANCE.getColumnsetFactory().createRecordMatchingIndicator();
    // Original author's note: match key is name, no block key, levenshtein attribute algorithm.
    matchIndicator.setAnalyzedElement(name);
    createMatchIndicatorWithOneMathRule(nameVar, matchIndicator, groupQualityThreshold, matchInterval);
    executeAnalysis(matchAnalysisExecutor, analysis, matchIndicator);

    // Assert group size and frequency.
    // Long.equals(...) is used instead of "get(...) == n" so that a missing group size
    // fails the assertion instead of throwing a NullPointerException while unboxing.
    Map<Object, Long> size2Frequency = matchIndicator.getGroupSize2groupFrequency();
    // For 4 -> "seb"
    assertTrue(Long.valueOf(1L).equals(size2Frequency.get(String.valueOf(4))));
    // For 1 -> "Sebastião","babass","nico"
    assertTrue(Long.valueOf(3L).equals(size2Frequency.get(String.valueOf(1))));
    // For 3 -> "nicolas"("nicola")
    assertTrue(Long.valueOf(1L).equals(size2Frequency.get(String.valueOf(3))));
    // For 2 -> "sebas","nigula"
    assertTrue(Long.valueOf(2L).equals(size2Frequency.get(String.valueOf(2))));

    // Assert row count, matched records and suspect records.
    assertTrue(matchIndicator.getCount() == 14);
    assertTrue(matchIndicator.getMatchedRecordCount() == 8);
    // Original author's note: the group of 3 ("nicolas"/"nicola") has group score 0.9 < 0.95,
    // so its three records are counted as suspect.
    assertTrue(matchIndicator.getSuspectRecordCount() == 3);
}
use of org.talend.dataquality.indicators.columnset.RecordMatchingIndicator in project tdq-studio-se by Talend.
the class ItemRecord method includeCustomMatcherJarDependencies.
/**
 * Looks up the record matching indicator of the given analysis through
 * {@code AnalysisHelper.getRecordMatchIndicatorFromAna} and delegates to the
 * {@code MatchRuleDefinition} overload of this method with the indicator's built-in match
 * rule definition.
 *
 * @param matchAnalysis analysis whose record matching indicator is inspected
 */
private void includeCustomMatcherJarDependencies(Analysis matchAnalysis) {
    final RecordMatchingIndicator indicator = AnalysisHelper.getRecordMatchIndicatorFromAna(matchAnalysis);
    // Keep the explicitly typed local so the same overload is selected as before.
    final MatchRuleDefinition ruleDefinition = indicator.getBuiltInMatchRuleDefinition();
    includeCustomMatcherJarDependencies(ruleDefinition);
}
Aggregations