Search in sources :

Example 1 with DistanceCorrelation

Use of ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceCorrelation in the project legato by DOREMUS-ANR.

In the class Matchifier, method match.

/**
 * Links instances of the "source" dataset to instances of the "target" dataset and saves the
 * resulting alignment.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Build the document vectors and split them into a source map and a target map according
 *       to their parent folder.</li>
 *   <li>Cluster each side with {@code Clustering.getClusters}.</li>
 *   <li>For every source/target cluster pair whose centroids have a cosine similarity above 0.4
 *       and which both hold more than one instance: collect the CBD of every instance, ask
 *       {@code KeysClassifier} for the best key, run SILK with that key and collect the links
 *       it writes to {@code links.rdf}.</li>
 *   <li>For every source document keep the most similar target document, provided the
 *       similarity reaches {@code legato.getThreshold()}.</li>
 *   <li>Merge both link lists ("link repairing"), remove the working files and save the final
 *       mappings with {@code Align.saveMappings}.</li>
 * </ol>
 *
 * @throws Exception if any of the steps above fails
 */
public void match() throws Exception {
    LEGATO legato = LEGATO.getInstance();
    // Final links
    MapList mapList = new MapList();
    // Recall++ : links found by the document-level comparison
    MapList mapList1 = new MapList();
    // Precision++ : links produced by SILK on cluster pairs
    MapList mapList2 = new MapList();

    /*
     * Getting vectors from "Source" and "Target" datasets
     */
    VectorGenerator vectorGenerator = new VectorGenerator();
    vectorGenerator.GetAllTerms();
    // List of "Source" and "Target" TF-IDF vectors
    DocVector[] docVectors = vectorGenerator.GetDocumentVectors();
    // "Source" TF-IDF vectors keyed by their "docName"
    HashMap<String, double[]> srcMap = new HashMap<String, double[]>();
    // "Target" TF-IDF vectors keyed by their "docName"
    HashMap<String, double[]> tgtMap = new HashMap<String, double[]>();
    // Identify "Source" and "Target" vectors
    for (DocVector doc : docVectors) {
        double[] vector = doc.getVector();
        if (doc.parentFolder.equals("source")) {
            srcMap.put(doc.docName, vector);
        } else if (doc.parentFolder.equals("target")) {
            tgtMap.put(doc.docName, vector);
        }
    }

    /*
     * "Hierarchical Clustering" on "Source" and "Target" datasets.
     * For each pair --> apply RANKey and link instances based on the best key.
     */
    // List of "Source" clusters
    ClusterList srcClS = Clustering.getClusters(srcMap);
    // List of "Target" clusters
    ClusterList tgtClS = Clustering.getClusters(tgtMap);

    /*
     * RANKey
     */
    int pairNumber = 0;
    // All pairs of clusters will be contained in folder "clusters"
    File dir = new File(legato.getPath() + File.separator + "clusters");
    dir.mkdirs();
    // The dataset models are loaded at most once, on the first qualifying pair, instead of once
    // per pair.
    // NOTE(review): reusing them assumes CBDBuilder.getCBD does not modify the model it reads
    // from — confirm.
    Model srcModel = null;
    Model tgtModel = null;
    for (Cluster clust1 : srcClS) {
        for (Cluster clust2 : tgtClS) {
            double centroidSimilarity = CosineSimilarity.cosineSimilarity(clust1.getCentroid(), clust2.getCentroid());
            // Only similar clusters that both contain more than one instance are linked here;
            // no cluster-level link is produced for any other pair.
            if (centroidSimilarity > 0.4 && clust1.size() > 1 && clust2.size() > 1) {
                pairNumber = pairNumber + 1;
                // Both clusters will be contained in folder "pairNumber"
                File dirClusters = new File(dir.getAbsolutePath() + File.separator + pairNumber);
                dirClusters.mkdirs();

                /*
                 * Retrieve RDF Model from the 2 clusters
                 */
                if (srcModel == null) {
                    srcModel = ModelManager.loadModel(legato.src.toString());
                }
                Model model1 = ModelFactory.createDefaultModel();
                for (String rsrce : clust1.getIDs().split("\n")) {
                    String uri = legato.getSrcURIs().get(rsrce);
                    Resource resource = ResourceFactory.createResource(uri);
                    model1.add(CBDBuilder.getCBD(srcModel, resource));
                }
                if (tgtModel == null) {
                    tgtModel = ModelManager.loadModel(legato.tgt.toString());
                }
                Model model2 = ModelFactory.createDefaultModel();
                for (String rsrce : clust2.getIDs().split("\n")) {
                    String uri = legato.getTgtURIs().get(rsrce);
                    Resource resource = ResourceFactory.createResource(uri);
                    model2.add(CBDBuilder.getCBD(tgtModel, resource));
                }

                /*
                 * Execute RANKey
                 */
                HashSet<String> bestKey = KeysClassifier.getBestKey(model1, model2, dirClusters);
                if (bestKey != null) {
                    /*
                     * Execute SILK
                     */
                    SilkConfig.config(bestKey, dirClusters, dirClusters.toString() + File.separator + "source.nt", dirClusters.toString() + File.separator + "target.nt");
                    SILK.link(dirClusters.toString());
                    File file = new File(dirClusters.toString() + File.separator + "links.rdf");
                    AlignmentParser aparser = new AlignmentParser(0);
                    Alignment links = aparser.parse(file.toURI());
                    for (Cell cell : links) {
                        mapList2.add(cell.getObject1AsURI().toString(), cell.getObject2AsURI().toString(), cell.getStrength());
                    }
                }
            }
        }
    }

    /*
     * Comparison: for each "source" document keep its most similar "target" document.
     */
    System.out.println("comparison");
    for (DocVector srcDoc : docVectors) {
        if (!srcDoc.parentFolder.equals("source")) {
            continue;
        }
        String tgtDoc = null;
        double simVal = 0;
        for (DocVector candidate : docVectors) {
            if (!candidate.parentFolder.equals("target")) {
                continue;
            }
            // Computed once per candidate and reused for both the test and the assignment.
            double similarity = CosineSimilarity.cosineSimilarity(srcDoc, candidate);
            if ((tgtDoc == null) || (similarity > simVal)) {
                tgtDoc = candidate.docName;
                simVal = similarity;
            }
        }
        if ((tgtDoc != null) && simVal >= legato.getThreshold()) {
            mapList1.add(legato.getSrcURIs().get(srcDoc.docName), legato.getTgtURIs().get(tgtDoc), simVal);
        }
    }

    /*
     * Link repairing: when both lists hold a link for the same source URI, the SILK link
     * (mapList2) is kept; otherwise the link from whichever list has one is kept.
     */
    for (Map map1 : mapList1) {
        boolean exist = false;
        for (Map map2 : mapList2) {
            if (map1.getSourceURI().equals(map2.getSourceURI())) {
                if (map1.getTargetURI().equals(map2.getTargetURI())) {
                    System.out.println("OUI");
                } else {
                    System.out.println("NON");
                }
                exist = true;
                mapList.add(map2);
            }
        }
        if (!exist) {
            mapList.add(map1);
        }
    }
    for (Map map2 : mapList2) {
        boolean exist = false;
        for (Map map1 : mapList1) {
            if (map2.getSourceURI().equals(map1.getSourceURI())) {
                exist = true;
            }
        }
        if (!exist) {
            System.out.println("+1");
            mapList.add(map2);
        }
    }

    /*
     * Remove the working files, then create and save the alignment file
     */
    File dirr = new File(legato.getPath() + File.separator + "docs");
    delete(dirr);
    File dirind = new File(legato.getPath() + File.separator + "index");
    delete(dirind);
    File srcFile = new File(legato.getPath() + File.separator + "source.rdf");
    srcFile.deleteOnExit();
    File tgtFile = new File(legato.getPath() + File.separator + "target.rdf");
    tgtFile.deleteOnExit();
    File txtFile = new File(legato.getPath() + File.separator + "nom.txt");
    txtFile.deleteOnExit();
    Align.saveMappings(mapList);
}
Also used : ClusterList(legato.cluster.ClusterList) HashMap(java.util.HashMap) Alignment(org.semanticweb.owl.align.Alignment) LEGATO(legato.LEGATO) Cell(org.semanticweb.owl.align.Cell) Resource(org.apache.jena.rdf.model.Resource) Cluster(legato.cluster.Cluster) DocVector(legato.indexer.DocVector) AlignmentParser(fr.inrialpes.exmo.align.parser.AlignmentParser) DistanceFunction(ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceFunction) Model(org.apache.jena.rdf.model.Model) VectorGenerator(legato.indexer.VectorGenerator) File(java.io.File) DistanceCorrelation(ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceCorrelation) DoubleArray(ca.pfv.spmf.patterns.cluster.DoubleArray) HashMap(java.util.HashMap)

Example 2 with DistanceCorrelation

Use of ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceCorrelation in the project legato by DOREMUS-ANR.

In the class Clustering, method getClusters.

/**
 * Runs hierarchical clustering (correlation distance, maximum distance 0.2) on the given
 * document vectors and converts the result into a {@code ClusterList}.
 *
 * <p>Each vector of each resulting cluster is mapped back to a document of {@code docs} by
 * comparing array contents. Every document is assigned at most once, so documents whose vectors
 * are identical are not added repeatedly to a cluster.
 *
 * @param docs document name mapped to its vector
 * @return the clusters, after {@code updateCentroids()} and {@code updateExemplars()} were called
 * @throws NumberFormatException declared by the signature; presumably raised by the clustering
 *         run — confirm
 * @throws IOException declared by the signature; presumably raised by the clustering run —
 *         confirm
 */
public static ClusterList getClusters(HashMap<String, double[]> docs) throws NumberFormatException, IOException {
    // double maxdistance = 0.415; // best threshold on DS_SM
    double maxdistance = 0.2;
    DistanceFunction distanceFunction = new DistanceCorrelation();
    HierarchicalClustering algo = new HierarchicalClustering();
    // The return value is not used: the clusters are read from algo.clusters below.
    algo.runAlgorithm(docs, maxdistance, distanceFunction);
    ClusterList clusterList = new ClusterList();
    // Names of the documents already placed in a cluster.
    java.util.Set<String> assignedDocs = new java.util.HashSet<String>();
    // For each cluster
    for (ClusterWithMean clust : algo.clusters) {
        Cluster cluster = new Cluster();
        // For each vector
        for (DoubleArray vector : clust.getVectors()) {
            for (Entry<String, double[]> doc : docs.entrySet()) {
                // Take the first not-yet-assigned document with the same vector content; one
                // document per vector.
                if (Arrays.equals(doc.getValue(), vector.data) && assignedDocs.add(doc.getKey())) {
                    DocVec docVec = new DocVec(doc.getKey(), doc.getValue());
                    cluster.add(docVec);
                    break;
                }
            }
        }
        clusterList.add(cluster);
    }
    clusterList.updateCentroids();
    clusterList.updateExemplars();
    return clusterList;
}
Also used : ClusterWithMean(ca.pfv.spmf.patterns.cluster.ClusterWithMean) DistanceCorrelation(ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceCorrelation) DoubleArray(ca.pfv.spmf.patterns.cluster.DoubleArray) DistanceFunction(ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceFunction)

Aggregations

DistanceCorrelation (ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceCorrelation)2 DistanceFunction (ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceFunction)2 DoubleArray (ca.pfv.spmf.patterns.cluster.DoubleArray)2 ClusterWithMean (ca.pfv.spmf.patterns.cluster.ClusterWithMean)1 AlignmentParser (fr.inrialpes.exmo.align.parser.AlignmentParser)1 File (java.io.File)1 HashMap (java.util.HashMap)1 LEGATO (legato.LEGATO)1 Cluster (legato.cluster.Cluster)1 ClusterList (legato.cluster.ClusterList)1 DocVector (legato.indexer.DocVector)1 VectorGenerator (legato.indexer.VectorGenerator)1 Model (org.apache.jena.rdf.model.Model)1 Resource (org.apache.jena.rdf.model.Resource)1 Alignment (org.semanticweb.owl.align.Alignment)1 Cell (org.semanticweb.owl.align.Cell)1