Search in sources:

Example 1 with DoubleArray

Use of ca.pfv.spmf.patterns.cluster.DoubleArray in the project legato by DOREMUS-ANR.

From the class Matchifier, the method match:

/**
 * Runs the complete LEGATO matching pipeline:
 * (1) builds TF-IDF vectors for the "source" and "target" datasets,
 * (2) clusters each dataset hierarchically and, for each similar pair of
 *     clusters, runs RANKey + SILK to produce precision-oriented links,
 * (3) links each source document to its most similar target document by
 *     cosine similarity (recall-oriented links),
 * (4) merges ("repairs") both link sets and saves the final alignment.
 *
 * @throws Exception if any of the indexing, clustering, linking or
 *                   alignment-parsing steps fails
 */
public void match() throws Exception {
    LEGATO legato = LEGATO.getInstance();
    // Final links
    MapList mapList = new MapList();
    // Recall++: links found by direct vector comparison
    MapList mapList1 = new MapList();
    // Precision++: links found by key-based (RANKey + SILK) linking
    MapList mapList2 = new MapList();
    /*
     * Getting vectors from "Source" and "Target" datasets.
     */
    VectorGenerator vectorGenerator = new VectorGenerator();
    vectorGenerator.GetAllTerms();
    // List of "Source" and "Target" TF-IDF vectors
    DocVector[] docVectors = vectorGenerator.GetDocumentVectors();
    // "Source" / "Target" TF-IDF vectors keyed by document name
    HashMap<String, double[]> srcMap = new HashMap<String, double[]>();
    HashMap<String, double[]> tgtMap = new HashMap<String, double[]>();
    for (DocVector doc : docVectors) {
        double[] vector = doc.getVector();
        if (doc.parentFolder.equals("source"))
            srcMap.put(doc.docName, vector);
        else if (doc.parentFolder.equals("target"))
            tgtMap.put(doc.docName, vector);
    }
    /*
     * "Hierarchical Clustering" on "Source" and "Target" datasets.
     * For each pair of clusters, apply RANKey and link instances based
     * on the best key.
     */
    ClusterList srcClS = Clustering.getClusters(srcMap);
    ClusterList tgtClS = Clustering.getClusters(tgtMap);
    /*
     * RANKey
     */
    int pairNumber = 0;
    // All pairs of clusters will be written under folder "clusters"
    File dir = new File(legato.getPath() + File.separator + "clusters");
    dir.mkdirs();
    // NOTE(review): both models are loop-invariant (legato.src / legato.tgt do
    // not change below), so load them once instead of reloading them from disk
    // for every qualifying cluster pair, as the previous version did.
    Model srcModel = ModelManager.loadModel(legato.src.toString());
    Model tgtModel = ModelManager.loadModel(legato.tgt.toString());
    for (Cluster clust1 : srcClS) {
        for (Cluster clust2 : tgtClS) {
            // Only consider sufficiently similar cluster pairs
            if (CosineSimilarity.cosineSimilarity(clust1.getCentroid(), clust2.getCentroid()) > 0.4) {
                // If both clusters contain more than one instance
                if (clust1.size() > 1 && clust2.size() > 1) {
                    pairNumber = pairNumber + 1;
                    // Both clusters will be written under folder "pairNumber"
                    File dirClusters = new File(dir.getAbsolutePath() + File.separator + pairNumber);
                    dirClusters.mkdirs();
                    /*
                     * Retrieve the RDF model (union of CBDs) of each cluster.
                     */
                    Model model1 = ModelFactory.createDefaultModel();
                    for (String rsrce : clust1.getIDs().split("\n")) {
                        String uri = legato.getSrcURIs().get(rsrce);
                        Resource resource = ResourceFactory.createResource(uri);
                        model1.add(CBDBuilder.getCBD(srcModel, resource));
                    }
                    Model model2 = ModelFactory.createDefaultModel();
                    for (String rsrce : clust2.getIDs().split("\n")) {
                        String uri = legato.getTgtURIs().get(rsrce);
                        Resource resource = ResourceFactory.createResource(uri);
                        model2.add(CBDBuilder.getCBD(tgtModel, resource));
                    }
                    /*
                     * Execute RANKey to find the best discriminating key.
                     */
                    HashSet<String> bestKey = KeysClassifier.getBestKey(model1, model2, dirClusters);
                    if (bestKey != null) {
                        /*
                         * Execute SILK with that key and collect the links it produced.
                         */
                        SilkConfig.config(bestKey, dirClusters, dirClusters.toString() + File.separator + "source.nt", dirClusters.toString() + File.separator + "target.nt");
                        SILK.link(dirClusters.toString());
                        File file = new File(dirClusters.toString() + File.separator + "links.rdf");
                        AlignmentParser aparser = new AlignmentParser(0);
                        Alignment links = aparser.parse(file.toURI());
                        for (Cell cell : links) {
                            mapList2.add(cell.getObject1AsURI().toString(), cell.getObject2AsURI().toString(), cell.getStrength());
                        }
                    }
                }
                // Singleton-to-singleton pairs are intentionally not linked here.
            }
        }
    }
    /*
     * Comparison: link each "source" document to its most similar
     * "target" document by cosine similarity.
     */
    System.out.println("comparison");
    for (DocVector srcDoc : docVectors) {
        if (!srcDoc.parentFolder.equals("source"))
            continue;
        String tgtDoc = null;
        double simVal = 0;
        for (DocVector candidate : docVectors) {
            if (!candidate.parentFolder.equals("target"))
                continue;
            // compute the similarity once per candidate (was computed twice)
            double sim = CosineSimilarity.cosineSimilarity(srcDoc, candidate);
            if (tgtDoc == null || sim > simVal) {
                tgtDoc = candidate.docName;
                simVal = sim;
            }
        }
        if (tgtDoc != null && simVal >= legato.getThreshold()) {
            mapList1.add(legato.getSrcURIs().get(srcDoc.docName), legato.getTgtURIs().get(tgtDoc), simVal);
        }
    }
    /*
     * Link repairing: merge the two link sets. When a source URI appears in
     * both, the key-based (precision) link wins; otherwise keep whichever
     * link exists.
     */
    for (Map map1 : mapList1) {
        boolean exist = false;
        for (Map map2 : mapList2) {
            if (map1.getSourceURI().equals(map2.getSourceURI())) {
                if (map1.getTargetURI().equals(map2.getTargetURI()))
                    System.out.println("OUI");
                else
                    System.out.println("NON");
                exist = true;
                mapList.add(map2);
            }
        }
        if (!exist)
            mapList.add(map1);
    }
    for (Map map2 : mapList2) {
        boolean exist = false;
        for (Map map1 : mapList1) {
            if (map2.getSourceURI().equals(map1.getSourceURI())) {
                exist = true;
            }
        }
        if (!exist) {
            System.out.println("+1");
            mapList.add(map2);
        }
    }
    /*
     * Create and save the alignment file, then clean up working files.
     */
    File dirr = new File(legato.getPath() + File.separator + "docs");
    delete(dirr);
    File dirind = new File(legato.getPath() + File.separator + "index");
    delete(dirind);
    File srcFile = new File(legato.getPath() + File.separator + "source.rdf");
    srcFile.deleteOnExit();
    File tgtFile = new File(legato.getPath() + File.separator + "target.rdf");
    tgtFile.deleteOnExit();
    File txtFile = new File(legato.getPath() + File.separator + "nom.txt");
    txtFile.deleteOnExit();
    Align.saveMappings(mapList);
}
Also used : ClusterList(legato.cluster.ClusterList) HashMap(java.util.HashMap) Alignment(org.semanticweb.owl.align.Alignment) LEGATO(legato.LEGATO) Cell(org.semanticweb.owl.align.Cell) Resource(org.apache.jena.rdf.model.Resource) Cluster(legato.cluster.Cluster) DocVector(legato.indexer.DocVector) AlignmentParser(fr.inrialpes.exmo.align.parser.AlignmentParser) DistanceFunction(ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceFunction) Model(org.apache.jena.rdf.model.Model) VectorGenerator(legato.indexer.VectorGenerator) File(java.io.File) DistanceCorrelation(ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceCorrelation) DoubleArray(ca.pfv.spmf.patterns.cluster.DoubleArray) HashMap(java.util.HashMap)

Example 2 with DoubleArray

Use of ca.pfv.spmf.patterns.cluster.DoubleArray in the project legato by DOREMUS-ANR.

From the class PropertyHandler, the method clean:

/**
 * Deletes problematic ("heterogeneous") properties from the source and
 * target datasets before matching. Each distinct property of the merged
 * datasets is scored, the scores are clustered with DBSCAN, and every
 * property whose score falls in the cluster with the highest mean is
 * removed from both models. The filtered models are registered back on
 * the LEGATO singleton.
 *
 * @param srcPath path of the source dataset
 * @param tgtPath path of the target dataset
 * @throws IOException if reading/writing the working files fails
 */
public static void clean(String srcPath, String tgtPath) throws IOException {
    LEGATO legato = LEGATO.getInstance();
    Model srcModel = ModelManager.loadModel(srcPath);
    Model tgtModel = ModelManager.loadModel(tgtPath);
    // Merge the rewritten source and target models into a single model
    // (the previous version created two default models only to overwrite them)
    Model mergedModel = ModelFactory.createDefaultModel();
    mergedModel.add(ModelManager.rewrite(srcModel, false));
    mergedModel.add(ModelManager.rewrite(tgtModel, false));
    List<Resource> properties = getDistinctProperties(mergedModel);
    System.out.println(legato.getPropList());
    // Score every distinct property; scores are kept as strings because
    // ValueComparator and the score file both work on the string form.
    HashMap<String, String> propScoreList = new HashMap<String, String>();
    properties.forEach((property) -> {
        propScoreList.put(property.toString(), String.valueOf(getScore(property, mergedModel)));
    });
    // Sort the properties by score
    ValueComparator<String> comp = new ValueComparator<String>(propScoreList);
    TreeMap<String, String> mapTriee = new TreeMap<String, String>(comp);
    mapTriee.putAll(propScoreList);
    System.out.println(mapTriee);
    // Dump the sorted scores, one per line, as DBSCAN input
    // (direct iteration replaces the old O(n^2) values().toArray()[i] loop)
    StringBuilder sb = new StringBuilder();
    for (String score : mapTriee.values()) {
        sb.append(Double.valueOf(score)).append("\n");
    }
    FileManager.create("nom", sb.toString().trim());
    // Cluster the scores with DBSCAN
    int minPts = 1;
    double epsilon = 5d;
    AlgoDBSCAN algo = new AlgoDBSCAN();
    List<Cluster> clusters = algo.runAlgorithm(legato.getPath() + File.separator + "nom.txt", minPts, epsilon, "\n");
    algo.printStatistics();
    // Find the cluster with the highest mean score: its members are the
    // scores of the "heterogeneous" properties to remove.
    double highMean = 0;
    double[] heterCluster = null;
    for (Cluster cluster : clusters) {
        double[] arr = new double[cluster.getVectors().size()];
        int i = 0;
        for (DoubleArray dataPoint : cluster.getVectors()) {
            arr[i++] = dataPoint.data[0];
        }
        A a = new A(arr);
        if (highMean < a.getMean()) {
            highMean = a.getMean();
            heterCluster = arr;
        }
    }
    // Collect the properties whose score belongs to that cluster.
    // Guard against DBSCAN returning no clusters (the previous version
    // would throw a NullPointerException here).
    List<String> propList = new ArrayList<String>();
    if (heterCluster != null) {
        for (Entry<String, String> entry : mapTriee.entrySet()) {
            for (double score : heterCluster) {
                if (String.valueOf(score).equals(entry.getValue()))
                    propList.add(entry.getKey());
            }
        }
    }
    System.out.println(propList);
    // Rewrite both models and drop every statement whose predicate is a
    // problematic property.
    srcModel = ModelManager.rewrite(srcModel, true);
    System.out.println("source");
    tgtModel = ModelManager.rewrite(tgtModel, true);
    Model srcFinalModel = ModelFactory.createDefaultModel();
    srcModel.listStatements().toSet().forEach((stmt) -> {
        Property property = stmt.getPredicate();
        if (!propList.contains(property.toString())) {
            srcFinalModel.add(stmt);
        }
    });
    Model tgtFinalModel = ModelFactory.createDefaultModel();
    tgtModel.listStatements().toSet().forEach((stmt) -> {
        Property property = stmt.getPredicate();
        if (!propList.contains(property.toString())) {
            tgtFinalModel.add(stmt);
        }
    });
    legato.setSource(FileManager.getCreatedRDFile("source", srcFinalModel));
    legato.setTarget(FileManager.getCreatedRDFile("target", tgtFinalModel));
    System.out.println("finish");
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Entry(java.util.Map.Entry) LEGATO(legato.LEGATO) Iterator(java.util.Iterator) Property(org.apache.jena.rdf.model.Property) Resource(org.apache.jena.rdf.model.Resource) Cluster(ca.pfv.spmf.patterns.cluster.Cluster) TreeMap(java.util.TreeMap) AlgoDBSCAN(ca.pfv.spmf.algorithms.clustering.dbscan.AlgoDBSCAN) Model(org.apache.jena.rdf.model.Model) DoubleArray(ca.pfv.spmf.patterns.cluster.DoubleArray)

Example 3 with DoubleArray

Use of ca.pfv.spmf.patterns.cluster.DoubleArray in the project legato by DOREMUS-ANR.

From the class Clustering, the method getClusters:

/**
 * Groups document TF-IDF vectors into clusters via hierarchical clustering,
 * then maps each clustered vector back to its document name.
 *
 * @param docs TF-IDF vectors keyed by document name
 * @return the list of clusters of (name, vector) documents, with centroids
 *         and exemplars recomputed
 * @throws NumberFormatException propagated from the clustering algorithm
 * @throws IOException           propagated from the clustering algorithm
 */
public static ClusterList getClusters(HashMap<String, double[]> docs) throws NumberFormatException, IOException {
    // double maxdistance = 0.415; // best threshold on DS_SM
    double maxdistance = 0.2;
    DistanceFunction distanceFunction = new DistanceCorrelation();
    HierarchicalClustering algo = new HierarchicalClustering();
    // Use the returned cluster list directly; the previous version ignored
    // it and iterated the algo.clusters field (runAlgorithm returns that
    // same field, so behavior is unchanged).
    List<ClusterWithMean> clusters = algo.runAlgorithm(docs, maxdistance, distanceFunction);
    ClusterList clusterList = new ClusterList();
    for (ClusterWithMean clust : clusters) {
        Cluster cluster = new Cluster();
        for (DoubleArray vector : clust.getVectors()) {
            // Recover the document name by matching the raw vector values
            for (Entry<String, double[]> doc : docs.entrySet()) {
                if (Arrays.equals(doc.getValue(), vector.data)) {
                    DocVec docVec = new DocVec(doc.getKey(), doc.getValue());
                    cluster.add(docVec);
                }
            }
        }
        clusterList.add(cluster);
    }
    clusterList.updateCentroids();
    clusterList.updateExemplars();
    return clusterList;
}
Also used : ClusterWithMean(ca.pfv.spmf.patterns.cluster.ClusterWithMean) DistanceCorrelation(ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceCorrelation) DoubleArray(ca.pfv.spmf.patterns.cluster.DoubleArray) DistanceFunction(ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceFunction)

Example 4 with DoubleArray

Use of ca.pfv.spmf.patterns.cluster.DoubleArray in the project legato by DOREMUS-ANR.

From the class HierarchicalClustering, the method mergeTheClosestCluster:

/**
 * Merges the two closest clusters (according to the configured distance
 * function) whose distance does not exceed {@code maxDistance}.
 *
 * @return true if a merge was done, otherwise false.
 */
private boolean mergeTheClosestCluster() {
    // These variables will contain the two closest clusters that
    // can be merged
    ClusterWithMean clusterToMerge1 = null;
    ClusterWithMean clusterToMerge2 = null;
    // Proper double sentinel; the previous Integer.MAX_VALUE would silently
    // misbehave for distances larger than 2^31 - 1.
    double minClusterDistance = Double.POSITIVE_INFINITY;
    // Compare all pairs of clusters i and j
    for (int i = 0; i < clusters.size(); i++) {
        for (int j = i + 1; j < clusters.size(); j++) {
            // calculate the distance between i and j
            double distance = distanceFunction.calculateDistance(clusters.get(i).getmean(), clusters.get(j).getmean());
            // keep this pair if it is the closest admissible one so far
            if (distance < minClusterDistance && distance <= maxDistance) {
                minClusterDistance = distance;
                clusterToMerge1 = clusters.get(i);
                clusterToMerge2 = clusters.get(j);
            }
        }
    }
    // if no close clusters were found, return false
    if (clusterToMerge1 == null) {
        return false;
    }
    // else, merge the two closest clusters: move every vector of the
    // second cluster into the first one
    for (DoubleArray vector : clusterToMerge2.getVectors()) {
        clusterToMerge1.addVector(vector);
    }
    // after merging, we need to recompute the mean of the resulting cluster
    clusterToMerge1.recomputeClusterMean();
    // we delete the cluster that was merged
    clusters.remove(clusterToMerge2);
    // increase iteration count for statistics
    iterationCount++;
    return true;
}
Also used : ClusterWithMean(ca.pfv.spmf.patterns.cluster.ClusterWithMean) DoubleArray(ca.pfv.spmf.patterns.cluster.DoubleArray)

Example 5 with DoubleArray

Use of ca.pfv.spmf.patterns.cluster.DoubleArray in the project legato by DOREMUS-ANR.

From the class HierarchicalClustering, the method runAlgorithm:

/**
 * Runs agglomerative (bottom-up) hierarchical clustering over the given
 * document vectors: every vector starts in its own cluster, and the two
 * closest clusters are merged repeatedly until no pair is within
 * {@code maxDistance} of each other.
 *
 * @param docs             TF-IDF vectors keyed by document name
 * @param maxDistance      maximum distance allowed between two clusters
 *                         for them to be merged
 * @param distanceFunction distance measure used between cluster means
 * @return the resulting list of clusters
 * @throws NumberFormatException declared for interface compatibility
 * @throws IOException           declared for interface compatibility
 */
public List<ClusterWithMean> runAlgorithm(HashMap<String, double[]> docs, double maxDistance, DistanceFunction distanceFunction) throws NumberFormatException, IOException {
    startTimestamp = System.currentTimeMillis();
    this.maxDistance = maxDistance;
    this.distanceFunction = distanceFunction;
    clusters = new ArrayList<ClusterWithMean>();
    // (1) Seed the clustering: one singleton cluster per vector, whose
    // mean is (a copy of) the vector itself.
    for (double[] rawVector : docs.values()) {
        DoubleArray point = new DoubleArray(rawVector);
        ClusterWithMean singleton = new ClusterWithMean(rawVector.length);
        singleton.addVector(point);
        singleton.setMean(point.clone());
        clusters.add(singleton);
    }
    // (2) Keep merging the two closest clusters until no admissible pair
    // remains; memory usage is sampled after every merge attempt.
    boolean merged;
    do {
        merged = mergeTheClosestCluster();
        MemoryLogger.getInstance().checkMemory();
    } while (merged);
    endTimestamp = System.currentTimeMillis();
    return clusters;
}
Also used : ClusterWithMean(ca.pfv.spmf.patterns.cluster.ClusterWithMean) DoubleArray(ca.pfv.spmf.patterns.cluster.DoubleArray)

Aggregations

DoubleArray (ca.pfv.spmf.patterns.cluster.DoubleArray)5 ClusterWithMean (ca.pfv.spmf.patterns.cluster.ClusterWithMean)3 DistanceCorrelation (ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceCorrelation)2 DistanceFunction (ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceFunction)2 HashMap (java.util.HashMap)2 LEGATO (legato.LEGATO)2 Model (org.apache.jena.rdf.model.Model)2 Resource (org.apache.jena.rdf.model.Resource)2 AlgoDBSCAN (ca.pfv.spmf.algorithms.clustering.dbscan.AlgoDBSCAN)1 Cluster (ca.pfv.spmf.patterns.cluster.Cluster)1 AlignmentParser (fr.inrialpes.exmo.align.parser.AlignmentParser)1 File (java.io.File)1 ArrayList (java.util.ArrayList)1 Iterator (java.util.Iterator)1 Entry (java.util.Map.Entry)1 TreeMap (java.util.TreeMap)1 Cluster (legato.cluster.Cluster)1 ClusterList (legato.cluster.ClusterList)1 DocVector (legato.indexer.DocVector)1 VectorGenerator (legato.indexer.VectorGenerator)1