Search in sources :

Example 6 with DuplicateCountIndicator

use of org.talend.dataquality.indicators.DuplicateCountIndicator in project tdq-studio-se by Talend.

The method basicSetDuplicateCountIndicator of the class ColumnSetMultiValueIndicatorImpl.

/**
 * <!-- begin-user-doc -->
 * Stores {@code newDuplicateCountIndicator} in the {@code duplicateCountIndicator} field and, when
 * notification is required, queues a {@code Notification.SET} event carrying the old and new values.
 *
 * @param newDuplicateCountIndicator the indicator to store
 * @param msgs the notification chain built so far, may be {@code null}
 * @return {@code msgs} untouched when no notification is required; otherwise the new notification itself
 * when {@code msgs} was {@code null}, or {@code msgs} with the notification appended
 * <!-- end-user-doc -->
 * @generated
 */
public NotificationChain basicSetDuplicateCountIndicator(DuplicateCountIndicator newDuplicateCountIndicator, NotificationChain msgs) {
    final DuplicateCountIndicator previous = duplicateCountIndicator;
    duplicateCountIndicator = newDuplicateCountIndicator;

    // Nothing to report: hand the incoming chain straight back.
    if (!eNotificationRequired()) {
        return msgs;
    }

    final ENotificationImpl change = new ENotificationImpl(this, Notification.SET,
            ColumnsetPackage.COLUMN_SET_MULTI_VALUE_INDICATOR__DUPLICATE_COUNT_INDICATOR, previous,
            newDuplicateCountIndicator);
    if (msgs == null) {
        // The notification starts the chain.
        return change;
    }
    msgs.add(change);
    return msgs;
}
Also used : DuplicateCountIndicator(org.talend.dataquality.indicators.DuplicateCountIndicator) ENotificationImpl(org.eclipse.emf.ecore.impl.ENotificationImpl)

Example 7 with DuplicateCountIndicator

use of org.talend.dataquality.indicators.DuplicateCountIndicator in project tdq-studio-se by Talend.

The method executeSqlQuery of the class DelimitedFileIndicatorEvaluator.

/**
 * Reads the delimited file behind the analysis connection and feeds its rows to the indicators.
 * <p>
 * The {@code sqlStatement} argument is not read by this implementation; the data source is the file whose
 * location is obtained from the connection.
 *
 * @param sqlStatement unused here
 * @return a return code that is ok unless the file cannot be found; a failed parse only triggers a warning
 * dialog and does not change the returned code
 */
@Override
protected ReturnCode executeSqlQuery(String sqlStatement) {
    final ReturnCode result = new ReturnCode(true);

    // Lazily pick up the connection from the analysis context.
    if (delimitedFileconnection == null) {
        delimitedFileconnection = (DelimitedFileConnection) analysis.getContext().getConnection();
    }
    // In context mode, work on a clone of the connection produced by the repository context service.
    if (delimitedFileconnection.isContextMode()) {
        IRepositoryContextService contextService = CoreRuntimePlugin.getInstance().getRepositoryContextService();
        delimitedFileconnection = (DelimitedFileConnection) contextService
                .cloneOriginalValueConnection(delimitedFileconnection);
    }

    IPath location = new Path(JavaSqlFactory.getURL(delimitedFileconnection));
    File sourceFile = location.toFile();
    if (!sourceFile.exists()) {
        result.setReturnCode(Messages.getString("DelimitedFileIndicatorEvaluator.CanNotFindFile"), false); //$NON-NLS-1$
        return result;
    }

    List<ModelElement> analysedElements = this.analysis.getContext().getAnalysedElements();
    EMap<Indicator, AnalyzedDataSet> rowsByIndicator = analysis.getResults().getIndicToRowMap();
    rowsByIndicator.clear();

    // Take the column list of the first analysed element whose owning table has columns.
    List<MetadataColumn> tableColumns = new ArrayList<MetadataColumn>();
    for (ModelElement element : analysedElements) {
        MetadataTable owner = ColumnHelper.getColumnOwnerAsMetadataTable((MetadataColumn) element);
        if (owner != null) {
            tableColumns = owner.getColumns();
        }
        if (!tableColumns.isEmpty()) {
            break;
        }
    }

    // CSV escaping is parsed with the CSV reader, everything else with the delimited reader.
    ReturnCode parseResult = Escape.CSV.equals(delimitedFileconnection.getEscapeType())
            ? useCsvReader(sourceFile, analysedElements, tableColumns, rowsByIndicator)
            : useDelimitedReader(analysedElements, tableColumns, rowsByIndicator);

    // Report a failed parse to the user on the UI thread.
    if (!parseResult.isOk()) {
        Display.getDefault().asyncExec(new Runnable() {

            public void run() {
                MessageDialog.openWarning(PlatformUI.getWorkbench().getActiveWorkbenchWindow().getShell(),
                        Messages.getString("DelimitedFileIndicatorEvaluator.badlyForm.Title"), //$NON-NLS-1$
                        Messages.getString("DelimitedFileIndicatorEvaluator.badlyForm.Message")); //$NON-NLS-1$
            }
        });
    }

    // Added yyin 20120608 TDQ-3589: make sure every duplicate-count indicator has a data set, then
    // push its result into the indicator-to-row map.
    for (MetadataColumn column : tableColumns) {
        for (Indicator indicator : getIndicators(column.getLabel())) {
            if (!(indicator instanceof DuplicateCountIndicator)) {
                continue;
            }
            if (rowsByIndicator.get(indicator) == null) {
                AnalyzedDataSet dataSet = AnalysisFactory.eINSTANCE.createAnalyzedDataSet();
                rowsByIndicator.put(indicator, dataSet);
                dataSet.setDataCount(analysis.getParameters().getMaxNumberRows());
                dataSet.setRecordSize(0);
            }
            addResultToIndicatorToRowMap(indicator, rowsByIndicator);
        }
    }
    return result;
}
Also used : IPath(org.eclipse.core.runtime.IPath) Path(org.eclipse.core.runtime.Path) DuplicateCountIndicator(org.talend.dataquality.indicators.DuplicateCountIndicator) ReturnCode(org.talend.utils.sugars.ReturnCode) IPath(org.eclipse.core.runtime.IPath) AnalyzedDataSet(org.talend.dataquality.analysis.AnalyzedDataSet) ArrayList(java.util.ArrayList) IRepositoryContextService(org.talend.core.IRepositoryContextService) UniqueCountIndicator(org.talend.dataquality.indicators.UniqueCountIndicator) Indicator(org.talend.dataquality.indicators.Indicator) DuplicateCountIndicator(org.talend.dataquality.indicators.DuplicateCountIndicator) RowCountIndicator(org.talend.dataquality.indicators.RowCountIndicator) ModelElement(orgomg.cwm.objectmodel.core.ModelElement) MetadataColumn(org.talend.core.model.metadata.builder.connection.MetadataColumn) MetadataTable(org.talend.core.model.metadata.builder.connection.MetadataTable) File(java.io.File)

Example 8 with DuplicateCountIndicator

use of org.talend.dataquality.indicators.DuplicateCountIndicator in project tdq-studio-se by Talend.

The method handleByARow of the class DelimitedFileIndicatorEvaluator.

/**
 * Feeds one parsed row of the delimited file to the indicators of every analysed column and, when data
 * storage is enabled, maintains the drill-down rows kept per indicator.
 *
 * @param rowValues the raw cell values of the current row
 * @param currentRow the number of the current row, used only in the warning message
 * @param analysisElementList the analysed elements; each one is cast to {@link MetadataColumn}
 * @param columnElementList the columns used to locate an analysed element inside a stored row
 * @param indicToRowMap the indicator-to-data-set map; missing entries are created here
 * @return a return code that is ok unless at least one analysed column had no usable position in
 * {@code rowValues}
 */
private ReturnCode handleByARow(String[] rowValues, long currentRow, List<ModelElement> analysisElementList, List<MetadataColumn> columnElementList, EMap<Indicator, AnalyzedDataSet> indicToRowMap) {
    ReturnCode returnCode = new ReturnCode(true);
    int maxNumberRows = analysis.getParameters().getMaxNumberRows();
    element: for (int i = 0; i < analysisElementList.size(); i++) {
        MetadataColumn mColumn = (MetadataColumn) analysisElementList.get(i);
        Integer position = ColumnHelper.getColumnIndex(mColumn);
        // Badly formed file: the row has no cell for this column. A negative index is rejected as well so
        // that it can never be used to index rowValues below.
        if (position == null || position < 0 || position >= rowValues.length) {
            log.warn(Messages.getString("DelimitedFileIndicatorEvaluator.incorrectData", mColumn.getLabel(), currentRow, //$NON-NLS-1$
                    delimitedFileconnection.getFilePath()));
            returnCode.setOk(false);
            continue;
        }
        Object object = TalendTypeConvert.convertToObject(mColumn.getTalendType(), rowValues[position], mColumn.getPattern());
        List<Indicator> indicators = getIndicators(mColumn.getLabel());
        for (Indicator indicator : indicators) {
            if (!continueRun()) {
                break element;
            }
            // bug 19036,to irregularly data,still compute for RowCountIndicator
            // NOTE(review): "continue element" abandons ALL remaining indicators of this column, so a
            // RowCountIndicator listed after another indicator is skipped too - confirm this is intended.
            if (object == null && !(indicator instanceof RowCountIndicator)) {
                continue element;
            }
            // Added yyin 20120608 TDQ-3589
            if (indicator instanceof DuplicateCountIndicator) {
                ((DuplicateCountIndicator) indicator).handle(object, rowValues);
            } else {
                // ~
                indicator.handle(object);
            }
            AnalyzedDataSet analyzedDataSet = indicToRowMap.get(indicator);
            if (analyzedDataSet == null) {
                analyzedDataSet = AnalysisFactory.eINSTANCE.createAnalyzedDataSet();
                indicToRowMap.put(indicator, analyzedDataSet);
                analyzedDataSet.setDataCount(maxNumberRows);
                analyzedDataSet.setRecordSize(0);
            }
            // see IndicatorEvaluator line 166, the logic is almost the same
            if (analysis.getParameters().isStoreData()) {
                if (indicator.mustStoreRow()) {
                    List<Object[]> valueObjectList = initDataSet(indicator, indicToRowMap, object);
                    // Index of the row being appended: the list size before anything is added.
                    int recordIncrement = valueObjectList.size();
                    List<Object> inputRowList = new ArrayList<Object>();
                    for (int j = 0; j < rowValues.length; j++) {
                        Object newobject = rowValues[j];
                        if (indicator.isUsedMapDBMode()) {
                            inputRowList.add(newobject == null ? PluginConstant.NULL_STRING : newobject);
                            continue;
                        } else {
                            if (recordIncrement < maxNumberRows) {
                                if (recordIncrement < valueObjectList.size()) {
                                    // The row array was appended on an earlier cell; fill in this cell.
                                    valueObjectList.get(recordIncrement)[j] = newobject;
                                } else {
                                    // First cell of the row: append a new row array.
                                    Object[] valueObject = new Object[rowValues.length];
                                    valueObject[j] = newobject;
                                    valueObjectList.add(valueObject);
                                }
                            } else {
                                // Row limit reached: store nothing more.
                                break;
                            }
                        }
                    }
                    if (indicator.isUsedMapDBMode()) {
                        MapDBUtils.handleDrillDownData(object, inputRowList, indicator);
                    }
                } else if (indicator instanceof UniqueCountIndicator && analysis.getResults().getIndicToRowMap().get(indicator).getData() != null) {
                    List<Object[]> removeValueObjectList = analysis.getResults().getIndicToRowMap().get(indicator).getData();
                    if (columnElementList.size() == 0) {
                        continue;
                    }
                    int offsetting = columnElementList.indexOf(indicator.getAnalyzedElement());
                    if (offsetting < 0) {
                        // The analysed element is not one of the known columns: no cell to compare against.
                        continue;
                    }
                    // Remove the first stored row whose cell equals the current value.
                    for (Object[] dataObject : removeValueObjectList) {
                        if (dataObject == null || offsetting >= dataObject.length) {
                            continue;
                        }
                        Object storedValue = dataObject[offsetting];
                        if (storedValue == null) {
                            continue;
                        }
                        boolean sameValue = storedValue.equals(object);
                        // Added yyin 20120611 TDQ5279: the stored cell may be the raw text of an integer value.
                        if (!sameValue && object instanceof Integer && storedValue instanceof String) {
                            try {
                                sameValue = object.equals(Integer.parseInt((String) storedValue));
                            } catch (NumberFormatException ignored) {
                                // Text that is not an integer cannot match an Integer value.
                                sameValue = false;
                            }
                        }
                        if (sameValue) {
                            // Safe inside the for-each only because the loop is left immediately.
                            removeValueObjectList.remove(dataObject);
                            break;
                        }
                    }
                }
            }
        }
    }
    return returnCode;
}
Also used : DuplicateCountIndicator(org.talend.dataquality.indicators.DuplicateCountIndicator) ReturnCode(org.talend.utils.sugars.ReturnCode) AnalyzedDataSet(org.talend.dataquality.analysis.AnalyzedDataSet) UniqueCountIndicator(org.talend.dataquality.indicators.UniqueCountIndicator) UniqueCountIndicator(org.talend.dataquality.indicators.UniqueCountIndicator) Indicator(org.talend.dataquality.indicators.Indicator) DuplicateCountIndicator(org.talend.dataquality.indicators.DuplicateCountIndicator) RowCountIndicator(org.talend.dataquality.indicators.RowCountIndicator) MetadataColumn(org.talend.core.model.metadata.builder.connection.MetadataColumn) ArrayList(java.util.ArrayList) List(java.util.List) RowCountIndicator(org.talend.dataquality.indicators.RowCountIndicator)

Example 9 with DuplicateCountIndicator

use of org.talend.dataquality.indicators.DuplicateCountIndicator in project tdq-studio-se by Talend.

The method basicSetDuplicateCountIndicator of the class CountsIndicatorImpl.

/**
 * <!-- begin-user-doc -->
 * Stores {@code newDuplicateCountIndicator} in the {@code duplicateCountIndicator} field and, when
 * notification is required, queues a {@code Notification.SET} event carrying the old and new values.
 *
 * @param newDuplicateCountIndicator the indicator to store
 * @param msgs the notification chain built so far, may be {@code null}
 * @return {@code msgs} untouched when no notification is required; otherwise the new notification itself
 * when {@code msgs} was {@code null}, or {@code msgs} with the notification appended
 * <!-- end-user-doc -->
 * @generated
 */
public NotificationChain basicSetDuplicateCountIndicator(DuplicateCountIndicator newDuplicateCountIndicator, NotificationChain msgs) {
    final DuplicateCountIndicator previous = duplicateCountIndicator;
    duplicateCountIndicator = newDuplicateCountIndicator;

    // Nothing to report: hand the incoming chain straight back.
    if (!eNotificationRequired()) {
        return msgs;
    }

    final ENotificationImpl change = new ENotificationImpl(this, Notification.SET,
            IndicatorsPackage.COUNTS_INDICATOR__DUPLICATE_COUNT_INDICATOR, previous, newDuplicateCountIndicator);
    if (msgs == null) {
        // The notification starts the chain.
        return change;
    }
    msgs.add(change);
    return msgs;
}
Also used : DuplicateCountIndicator(org.talend.dataquality.indicators.DuplicateCountIndicator) ENotificationImpl(org.eclipse.emf.ecore.impl.ENotificationImpl)

Example 10 with DuplicateCountIndicator

use of org.talend.dataquality.indicators.DuplicateCountIndicator in project tdq-studio-se by Talend.

The method main of the class IndicatorEvaluationMain.

/**
 * Manual test driver: connects to the database described in {@code db.properties}, evaluates a fixed set
 * of indicators on the columns of table {@code test.my_test}, prints the results and saves them to
 * {@code out/columnTest_0.1.ana}.
 * <p>
 * The JDBC connection opened here is closed before the method returns, whether or not the evaluation
 * succeeded.
 *
 * @param args not used
 */
public static void main(String[] args) {
    TypedProperties connectionParams = PropertiesLoader.getProperties(IndicatorEvaluator.class, "db.properties");
    String driverClassName = connectionParams.getProperty("driver");
    String dbUrl = connectionParams.getProperty("url");
    // Declared outside the try block so that the finally clause can close it.
    Connection connection = null;
    try {
        TimeTracer tt = new TimeTracer("Indicator evaluation", null);
        tt.start();
        // create connection
        connection = ConnectionUtils.createConnection(dbUrl, driverClassName, connectionParams);
        String database = "test";
        String tableName = "my_test";
        // --- columns to analyze (trailing comments give the array index)
        String[] columnsArray = new String[] {
                "my_int", // 0
                "my_double", // 1
                "my_text", // 2
                "my_date", // 3
                "my_string", // 4
                "my_int_null" // 5
        };
        List<String> columns = Arrays.asList(columnsArray);
        // store in file
        File file = new File("out/columnTest_0.1.ana");
        EMFUtil util = new EMFUtil();
        Resource resource = util.getResourceSet().createResource(URI.createFileURI(file.getAbsolutePath()));
        rContents = resource.getContents();
        evaluator.setConnection(connection);
        // --- create indicators
        RowCountIndicator rowCountIndicator = IndicatorsFactory.eINSTANCE.createRowCountIndicator();
        NullCountIndicator nullCountIndicator = IndicatorsFactory.eINSTANCE.createNullCountIndicator();
        DistinctCountIndicator distinctCountIndicator = IndicatorsFactory.eINSTANCE.createDistinctCountIndicator();
        DistinctCountIndicator distinctCountIndicator2 = IndicatorsFactory.eINSTANCE.createDistinctCountIndicator();
        UniqueCountIndicator uniqueCountIndicator = IndicatorsFactory.eINSTANCE.createUniqueCountIndicator();
        DuplicateCountIndicator duplicateCountIndicator = IndicatorsFactory.eINSTANCE.createDuplicateCountIndicator();
        BlankCountIndicator blankCountIndicator = IndicatorsFactory.eINSTANCE.createBlankCountIndicator();
        MinLengthIndicator minLengthIndicator = IndicatorsFactory.eINSTANCE.createMinLengthIndicator();
        MaxLengthIndicator maxLengthIndicator = IndicatorsFactory.eINSTANCE.createMaxLengthIndicator();
        AverageLengthIndicator averageLengthIndicator = IndicatorsFactory.eINSTANCE.createAverageLengthIndicator();
        AverageLengthIndicator averageLengthIndicator2 = IndicatorsFactory.eINSTANCE.createAverageLengthIndicator();
        ModeIndicator modeIndicator = IndicatorsFactory.eINSTANCE.createModeIndicator();
        FrequencyIndicator textFrequencyIndicator = IndicatorsFactory.eINSTANCE.createFrequencyIndicator();
        // store in freq indic
        // textFrequencyIndicator.setDistinctCountIndicator(distinctCountIndicator);
        // textFrequencyIndicator.setDistinctCountIndicator(distinctCountIndicator2);
        // textFrequencyIndicator.setUniqueCountIndicator(uniqueCountIndicator);
        // textFrequencyIndicator.setDuplicateCountIndicator(duplicateCountIndicator);
        // textFrequencyIndicator.setModeIndicator(modeIndicator);
        MeanIndicator doubleMeanIndicator = IndicatorsFactory.eINSTANCE.createMeanIndicator();
        MeanIndicator integerMeanIndicator = IndicatorsFactory.eINSTANCE.createMeanIndicator();
        MedianIndicator medianIndicator = IndicatorsFactory.eINSTANCE.createMedianIndicator();
        SumIndicator integerSumIndicator = IndicatorsFactory.eINSTANCE.createSumIndicator();
        addIndicator(columnsArray[0], medianIndicator);
        addIndicator(columnsArray[1], doubleMeanIndicator);
        addIndicator(columnsArray[2], blankCountIndicator);
        addIndicator(columnsArray[5], nullCountIndicator);
        // addIndicator(columnsArray[2], textFrequencyIndicator);
        // addIndicator(columnsArray[2], distinctCountIndicator); // probably not useful?
        // addIndicator(columnsArray[2], uniqueCountIndicator); // probably not useful?
        // addIndicator(columnsArray[2], duplicateCountIndicator); // probably not useful?
        // addIndicator(columnsArray[2], modeIndicator); // probably not useful?
        // NOTE(review): index 3 is "my_date"; the length indicators below may have been meant for
        // "my_string" (index 4) - confirm before relying on these results.
        addIndicator(columnsArray[3], rowCountIndicator);
        addIndicator(columnsArray[5], integerSumIndicator);
        addIndicator(columnsArray[5], integerMeanIndicator);
        addIndicator(columnsArray[2], averageLengthIndicator);
        addIndicator(columnsArray[3], averageLengthIndicator2);
        addIndicator(columnsArray[3], minLengthIndicator);
        addIndicator(columnsArray[3], maxLengthIndicator);
        // build query on columns
        // TODO scorreia add filter somewhere here...
        String selectCols = sqlSelectColumns(database, tableName, columns);
        // --- create a description of the column set
        QueryExpression queryExpression = DatatypesFactory.eINSTANCE.createQueryExpression();
        queryExpression.setBody(selectCols);
        // TODO scorreia externalize this as a constant
        queryExpression.setLanguage("SQL");
        tt.start("compute");
        evaluator.setFetchSize(10000);
        evaluator.evaluateIndicators(selectCols, true);
        tt.end("compute");
        // Print indicators the median
        System.out.println("Median=" + medianIndicator.getMedian());
        // NOTE(review): textFrequencyIndicator is never registered through addIndicator (the call is
        // commented out above), so the two counts printed here presumably stay at their initial values.
        System.out.println("# Unique values= " + textFrequencyIndicator.getUniqueValueCount());
        System.out.println("# Distinct values= " + textFrequencyIndicator.getDistinctValueCount());
        for (String col : columns) {
            printIndicators(evaluator.getIndicators(col));
        }
        tt.start("save");
        util.save();
        tt.end("saved in " + file.getAbsolutePath());
        tt.end();
        CwmResource cwmR = (CwmResource) resource;
        String id = cwmR.getID(medianIndicator);
        System.out.println("ecore util.getId= " + EcoreUtil.getID(medianIndicator));
        System.out.println("uuId= " + id);
    // test reload this file
    // LoadSerialData.main(args);
    } catch (SQLException e) {
        log.error(e, e);
    } catch (InstantiationException e) {
        log.error(e, e);
    } catch (IllegalAccessException e) {
        log.error(e, e);
    } catch (ClassNotFoundException e) {
        log.error(e, e);
    } finally {
        // Release the JDBC connection; closing an already closed connection is a no-op.
        if (connection != null) {
            try {
                connection.close();
            } catch (SQLException e) {
                log.error(e, e);
            }
        }
    }
}
Also used : SumIndicator(org.talend.dataquality.indicators.SumIndicator) SQLException(java.sql.SQLException) BlankCountIndicator(org.talend.dataquality.indicators.BlankCountIndicator) MedianIndicator(org.talend.dataquality.indicators.MedianIndicator) UniqueCountIndicator(org.talend.dataquality.indicators.UniqueCountIndicator) ModeIndicator(org.talend.dataquality.indicators.ModeIndicator) NullCountIndicator(org.talend.dataquality.indicators.NullCountIndicator) FrequencyIndicator(org.talend.dataquality.indicators.FrequencyIndicator) DistinctCountIndicator(org.talend.dataquality.indicators.DistinctCountIndicator) RowCountIndicator(org.talend.dataquality.indicators.RowCountIndicator) QueryExpression(orgomg.cwm.foundation.datatypes.QueryExpression) MeanIndicator(org.talend.dataquality.indicators.MeanIndicator) CwmResource(org.talend.model.emf.CwmResource) DuplicateCountIndicator(org.talend.dataquality.indicators.DuplicateCountIndicator) MaxLengthIndicator(org.talend.dataquality.indicators.MaxLengthIndicator) Connection(java.sql.Connection) CwmResource(org.talend.model.emf.CwmResource) Resource(org.eclipse.emf.ecore.resource.Resource) MinLengthIndicator(org.talend.dataquality.indicators.MinLengthIndicator) TypedProperties(org.talend.utils.properties.TypedProperties) TimeTracer(org.talend.utils.time.TimeTracer) AverageLengthIndicator(org.talend.dataquality.indicators.AverageLengthIndicator) EMFUtil(org.talend.commons.emf.EMFUtil) File(java.io.File)

Aggregations

DuplicateCountIndicator (org.talend.dataquality.indicators.DuplicateCountIndicator)12 RowCountIndicator (org.talend.dataquality.indicators.RowCountIndicator)7 ArrayList (java.util.ArrayList)6 UniqueCountIndicator (org.talend.dataquality.indicators.UniqueCountIndicator)6 AnalyzedDataSet (org.talend.dataquality.analysis.AnalyzedDataSet)4 DistinctCountIndicator (org.talend.dataquality.indicators.DistinctCountIndicator)4 Indicator (org.talend.dataquality.indicators.Indicator)4 MaxLengthIndicator (org.talend.dataquality.indicators.MaxLengthIndicator)4 MinLengthIndicator (org.talend.dataquality.indicators.MinLengthIndicator)4 ReturnCode (org.talend.utils.sugars.ReturnCode)4 List (java.util.List)3 MetadataColumn (org.talend.core.model.metadata.builder.connection.MetadataColumn)3 AverageLengthIndicator (org.talend.dataquality.indicators.AverageLengthIndicator)3 BlankCountIndicator (org.talend.dataquality.indicators.BlankCountIndicator)3 MeanIndicator (org.talend.dataquality.indicators.MeanIndicator)3 MedianIndicator (org.talend.dataquality.indicators.MedianIndicator)3 File (java.io.File)2 ENotificationImpl (org.eclipse.emf.ecore.impl.ENotificationImpl)2 MetadataTable (org.talend.core.model.metadata.builder.connection.MetadataTable)2 TdColumn (org.talend.cwm.relational.TdColumn)2