Search in sources :

Example 1 with LEGATO

use of legato.LEGATO in project legato by DOREMUS-ANR.

Method match of class Matchifier.

/**
 * Links "source" instances to "target" instances and saves the resulting alignment.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>build the TF-IDF document vectors and split them by parent folder
 *       ("source" / "target");</li>
 *   <li>cluster each side; for every pair of clusters whose centroid cosine similarity
 *       exceeds 0.4 and that both hold more than one instance, compute the best key and
 *       run SILK, collecting the produced links (precision-oriented list);</li>
 *   <li>for every source document keep its most similar target document when the
 *       similarity reaches {@code legato.getThreshold()} (recall-oriented list);</li>
 *   <li>merge both lists ("link repairing"): a key-based link replaces the
 *       similarity-based link that shares its source URI;</li>
 *   <li>remove working directories/files and save the merged mappings.</li>
 * </ol>
 *
 * @throws Exception if any step of the pipeline fails
 */
public void match() throws Exception {
    LEGATO legato = LEGATO.getInstance();
    // Final links
    MapList mapList = new MapList();
    // Recall-oriented links: best cosine match per source document
    MapList mapList1 = new MapList();
    // Precision-oriented links: produced by SILK from the best key of each cluster pair
    MapList mapList2 = new MapList();

    /* ---- TF-IDF vectors of the "source" and "target" datasets ---- */
    VectorGenerator vectorGenerator = new VectorGenerator();
    vectorGenerator.GetAllTerms();
    DocVector[] docVectors = vectorGenerator.GetDocumentVectors();
    // docName -> TF-IDF vector, one map per dataset
    HashMap<String, double[]> srcMap = new HashMap<String, double[]>();
    HashMap<String, double[]> tgtMap = new HashMap<String, double[]>();
    for (DocVector doc : docVectors) {
        double[] vector = doc.getVector();
        if (doc.parentFolder.equals("source")) {
            srcMap.put(doc.docName, vector);
        } else if (doc.parentFolder.equals("target")) {
            tgtMap.put(doc.docName, vector);
        }
    }

    /* ---- Hierarchical clustering, then key-based linking per cluster pair ---- */
    ClusterList srcClS = Clustering.getClusters(srcMap);
    ClusterList tgtClS = Clustering.getClusters(tgtMap);
    int pairNumber = 0;
    // Every retained pair of clusters gets its own numbered sub-folder of "clusters"
    File dir = new File(legato.getPath() + File.separator + "clusters");
    dir.mkdirs();
    for (Cluster clust1 : srcClS) {
        for (Cluster clust2 : tgtClS) {
            // Pairs where either cluster is a singleton are intentionally not linked here;
            // they are only covered by the document-level comparison further down.
            if (CosineSimilarity.cosineSimilarity(clust1.getCentroid(), clust2.getCentroid()) > 0.4
                    && clust1.size() > 1 && clust2.size() > 1) {
                pairNumber++;
                File dirClusters = new File(dir.getAbsolutePath() + File.separator + pairNumber);
                dirClusters.mkdirs();

                // NOTE(review): both datasets are reloaded for every cluster pair. Loading them
                // once before the loops looks possible, but only if loadModel/getCBD do not
                // depend on or mutate shared state — confirm before hoisting.
                Model srcModel = ModelManager.loadModel(legato.src.toString());
                Model model1 = ModelFactory.createDefaultModel();
                for (String rsrce : clust1.getIDs().split("\n")) {
                    String uri = legato.getSrcURIs().get(rsrce);
                    Resource resource = ResourceFactory.createResource(uri);
                    model1.add(CBDBuilder.getCBD(srcModel, resource));
                }
                Model tgtModel = ModelManager.loadModel(legato.tgt.toString());
                Model model2 = ModelFactory.createDefaultModel();
                for (String rsrce : clust2.getIDs().split("\n")) {
                    String uri = legato.getTgtURIs().get(rsrce);
                    Resource resource = ResourceFactory.createResource(uri);
                    model2.add(CBDBuilder.getCBD(tgtModel, resource));
                }

                // RANKey: best key for this pair; null means no usable key was found
                HashSet<String> bestKey = KeysClassifier.getBestKey(model1, model2, dirClusters);
                if (bestKey != null) {
                    // SILK: link the two cluster dumps using the selected key
                    SilkConfig.config(bestKey, dirClusters, dirClusters.toString() + File.separator + "source.nt", dirClusters.toString() + File.separator + "target.nt");
                    SILK.link(dirClusters.toString());
                    File file = new File(dirClusters.toString() + File.separator + "links.rdf");
                    AlignmentParser aparser = new AlignmentParser(0);
                    Alignment links = aparser.parse(file.toURI());
                    for (Cell cell : links) {
                        mapList2.add(cell.getObject1AsURI().toString(), cell.getObject2AsURI().toString(), cell.getStrength());
                    }
                }
            }
        }
    }

    /* ---- Document-level comparison: best target for each source document ---- */
    System.out.println("comparison");
    for (DocVector srcDoc : docVectors) {
        if (!srcDoc.parentFolder.equals("source")) {
            continue;
        }
        String tgtDoc = null;
        double simVal = 0;
        for (DocVector candidate : docVectors) {
            if (!candidate.parentFolder.equals("target")) {
                continue;
            }
            // Computed once per pair (the similarity is reused for both the test and the update)
            double sim = CosineSimilarity.cosineSimilarity(srcDoc, candidate);
            if (tgtDoc == null || sim > simVal) {
                tgtDoc = candidate.docName;
                simVal = sim;
            }
        }
        if (tgtDoc != null && simVal >= legato.getThreshold()) {
            mapList1.add(legato.getSrcURIs().get(srcDoc.docName), legato.getTgtURIs().get(tgtDoc), simVal);
        }
    }

    /* ---- Link repairing ---- */
    // Pass 1: for each similarity-based link, prefer every key-based link with the same source;
    // keep the similarity-based link only when no key-based link exists for that source.
    for (Map map1 : mapList1) {
        boolean exist = false;
        for (Map map2 : mapList2) {
            if (map1.getSourceURI().equals(map2.getSourceURI())) {
                if (map1.getTargetURI().equals(map2.getTargetURI())) {
                    System.out.println("OUI");
                } else {
                    System.out.println("NON");
                }
                exist = true;
                mapList.add(map2);
            }
        }
        if (!exist) {
            mapList.add(map1);
        }
    }
    // Pass 2: add key-based links whose source has no similarity-based link at all.
    for (Map map2 : mapList2) {
        boolean exist = false;
        for (Map map1 : mapList1) {
            if (map2.getSourceURI().equals(map1.getSourceURI())) {
                exist = true;
                break;
            }
        }
        if (!exist) {
            System.out.println("+1");
            mapList.add(map2);
        }
    }

    /* ---- Clean up working files, then create and save the alignment file ---- */
    File dirr = new File(legato.getPath() + File.separator + "docs");
    delete(dirr);
    File dirind = new File(legato.getPath() + File.separator + "index");
    delete(dirind);
    File srcFile = new File(legato.getPath() + File.separator + "source.rdf");
    srcFile.deleteOnExit();
    File tgtFile = new File(legato.getPath() + File.separator + "target.rdf");
    tgtFile.deleteOnExit();
    File txtFile = new File(legato.getPath() + File.separator + "nom.txt");
    txtFile.deleteOnExit();
    Align.saveMappings(mapList);
}
Also used : ClusterList(legato.cluster.ClusterList) HashMap(java.util.HashMap) Alignment(org.semanticweb.owl.align.Alignment) LEGATO(legato.LEGATO) Cell(org.semanticweb.owl.align.Cell) Resource(org.apache.jena.rdf.model.Resource) Cluster(legato.cluster.Cluster) DocVector(legato.indexer.DocVector) AlignmentParser(fr.inrialpes.exmo.align.parser.AlignmentParser) DistanceFunction(ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceFunction) Model(org.apache.jena.rdf.model.Model) VectorGenerator(legato.indexer.VectorGenerator) File(java.io.File) DistanceCorrelation(ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceCorrelation) DoubleArray(ca.pfv.spmf.patterns.cluster.DoubleArray) HashMap(java.util.HashMap)

Example 2 with LEGATO

use of legato.LEGATO in project legato by DOREMUS-ANR.

Method rewrite of class ModelManager.

/**
 * Builds a new model in which every literal found in the CBD of a typed resource is
 * attached directly to that resource (distance = 1).
 *
 * <p>For each subject of {@code model} accepted by {@code legato.hasType}, the CBD is
 * computed (extended with direct predecessors and successors when {@code ok} is true).
 * For every literal in it, the shortest path from the resource to the literal is looked
 * up; the sequence of properties on that path is mapped to a single property (a new
 * {@code http://model.org/property<N>} is registered in the LEGATO property list the first
 * time a sequence is seen) and the statement {@code resource property literal} is added
 * to the result. When no path starts from the resource itself, paths starting from the
 * resource's predecessors in {@code model} are tried instead. {@code rdf:type} statements
 * whose subject is itself a typed resource are copied onto the resource.
 *
 * <p>I/O failures while parsing a CBD or registering a property are reported on standard
 * error and processing continues (best effort).
 *
 * @param model the model to flatten
 * @param ok    whether to extend each CBD with the direct predecessors and successors
 * @return the flattened model
 * @throws IOException declared for callers; not thrown by the visible code paths
 */
public static Model rewrite(Model model, boolean ok) throws IOException {
    LEGATO legato = LEGATO.getInstance();
    Model finalModel = ModelFactory.createDefaultModel();
    model.listSubjects().toSet().forEach((resource) -> {
        // Only resources of the configured type(s) are rewritten
        if (!legato.hasType(resource)) {
            return;
        }
        Model m = CBDBuilder.getCBD(model, resource);
        if (ok) {
            m.add(CBDBuilder.getCBDDirectPredecessors(model, resource));
            m.add(CBDBuilder.getCBDDirectSuccessors(model, resource));
        }
        try {
            m.add(ModelManager.parseCBD(m));
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        m.listStatements().toSet().forEach((stmt) -> {
            Resource sub = stmt.getSubject();
            Property prop = stmt.getPredicate();
            RDFNode object = stmt.getObject();
            if (object.isLiteral()) {
                // Filter.any accepts every statement while searching the path
                Path path = OntTools.findShortestPath(m, resource, object, Filter.any);
                if (path != null) {
                    attachLiteral(legato, finalModel, resource, object, getPropFromPath(path));
                    return;
                }
                // No path from the resource itself: try from each of its predecessors
                String sparqlQueryString = "select ?predec where {" + "?predec ?prop <" + resource + ">." + "}";
                Query query = QueryFactory.create(sparqlQueryString);
                QueryExecution qexec = QueryExecutionFactory.create(query, model);
                try {
                    ResultSet queryResults = qexec.execSelect();
                    while (queryResults.hasNext()) {
                        QuerySolution qs = queryResults.nextSolution();
                        final PathManager.Path path2 = PathManager.findShortestPath(model, qs.getResource("?predec"), object, prop);
                        if (path2 != null) {
                            attachLiteral(legato, finalModel, resource, object, getPropFromPath(path2));
                        }
                    }
                } finally {
                    // Always release the query execution, even if iteration fails
                    qexec.close();
                }
            } else if (prop.equals(RDF.type) && legato.hasType(sub)) {
                finalModel.createResource(resource.toString()).addProperty(RDF.type, object);
            }
        });
    });
    return finalModel;
}

/**
 * Adds {@code resource property literal} to {@code finalModel}, where the property stands
 * for the given property path: an already registered name is reused, otherwise a new
 * {@code http://model.org/property<N>} is created and registered in the LEGATO property list.
 */
private static void attachLiteral(LEGATO legato, Model finalModel, Resource resource, RDFNode literal, List<Property> properties) {
    if (legato.getPropList().existProperty(properties)) {
        finalModel.createResource(resource.toString()).addProperty(finalModel.createProperty(legato.getPropList().getPropertyName(properties)), literal);
        return;
    }
    int indice = legato.getPropList().size();
    finalModel.createResource(resource.toString()).addProperty(finalModel.createProperty("http://model.org/property" + indice), literal);
    try {
        legato.addToPropList("http://model.org/property" + indice, properties);
    } catch (IOException e) {
        // Previously swallowed silently; surface it the same way as the CBD parsing failure
        e.printStackTrace();
    }
}
Also used : Path(org.apache.jena.ontology.OntTools.Path) Query(org.apache.jena.query.Query) Resource(org.apache.jena.rdf.model.Resource) IOException(java.io.IOException) QueryExecution(org.apache.jena.query.QueryExecution) LEGATO(legato.LEGATO) QuerySolution(org.apache.jena.query.QuerySolution) OntModel(org.apache.jena.ontology.OntModel) Model(org.apache.jena.rdf.model.Model) ResultSet(org.apache.jena.query.ResultSet) Property(org.apache.jena.rdf.model.Property) RDFNode(org.apache.jena.rdf.model.RDFNode)

Example 3 with LEGATO

use of legato.LEGATO in project legato by DOREMUS-ANR.

Method clean of class PropertyHandler.

/**
 * Removes "problematic" properties from the source and target datasets.
 *
 * <p>Both datasets are rewritten (without CBD extension) and merged; every distinct
 * property of the merged model gets a score. The scores are written to {@code nom.txt}
 * and clustered with DBSCAN; the cluster with the highest mean is selected and every
 * property whose score belongs to it is discarded. Finally both datasets are rewritten
 * again (with CBD extension), filtered, and registered as the new LEGATO source/target.
 *
 * <p>If DBSCAN yields no cluster with a mean above 0, no property is discarded.
 *
 * @param srcPath path of the source dataset
 * @param tgtPath path of the target dataset
 * @throws IOException if reading or writing a dataset/work file fails
 */
public static void clean(String srcPath, String tgtPath) throws IOException {
    LEGATO legato = LEGATO.getInstance();
    Model srcModel = ModelManager.loadModel(srcPath);
    Model tgtModel = ModelManager.loadModel(tgtPath);

    // Merge the flattened source and target (source is rewritten first, as before)
    Model mergedModel = ModelFactory.createDefaultModel();
    mergedModel.add(ModelManager.rewrite(srcModel, false));
    mergedModel.add(ModelManager.rewrite(tgtModel, false));

    List<Resource> properties = getDistinctProperties(mergedModel);
    System.out.println(legato.getPropList());

    // property URI -> score (kept as text; compared textually further down)
    HashMap<String, String> propScoreList = new HashMap<String, String>();
    for (Resource property : properties) {
        propScoreList.put(property.toString(), String.valueOf(getScore(property, mergedModel)));
    }
    ValueComparator<String> comp = new ValueComparator<String>(propScoreList);
    TreeMap<String, String> mapTriee = new TreeMap<String, String>(comp);
    mapTriee.putAll(propScoreList);
    System.out.println(mapTriee);

    // One score per line, in the order of the sorted map: input file for DBSCAN
    StringBuilder sb = new StringBuilder();
    for (String score : mapTriee.values()) {
        sb.append(Double.valueOf(score)).append("\n");
    }
    FileManager.create("nom", sb.toString().trim());

    int minPts = 1;
    double epsilon = 5d;
    AlgoDBSCAN algo = new AlgoDBSCAN();
    List<Cluster> clusters = algo.runAlgorithm(legato.getPath() + File.separator + "nom.txt", minPts, epsilon, "\n");
    algo.printStatistics();

    // Select the cluster of scores with the highest mean
    double highMean = 0;
    double[] heterCluster = null;
    for (Cluster cluster : clusters) {
        double[] arr = new double[cluster.getVectors().size()];
        int i = 0;
        for (DoubleArray dataPoint : cluster.getVectors()) {
            arr[i++] = dataPoint.data[0];
        }
        double mean = new A(arr).getMean();
        if (highMean < mean) {
            highMean = mean;
            heterCluster = arr;
        }
    }

    // Properties whose score falls in the selected cluster are the ones to drop.
    // heterCluster stays null when no cluster qualified; then nothing is dropped.
    List<String> propList = new ArrayList<String>();
    if (heterCluster != null) {
        for (Entry<String, String> entry : mapTriee.entrySet()) {
            for (double score : heterCluster) {
                if (String.valueOf(score).equals(entry.getValue())) {
                    propList.add(entry.getKey());
                    // One match is enough; avoids listing the same property several times
                    break;
                }
            }
        }
    }
    System.out.println(propList);

    srcModel = ModelManager.rewrite(srcModel, true);
    System.out.println("source");
    tgtModel = ModelManager.rewrite(tgtModel, true);

    Model srcFinalModel = ModelFactory.createDefaultModel();
    srcModel.listStatements().toSet().forEach((stmt) -> {
        Property property = stmt.getPredicate();
        if (!propList.contains(property.toString())) {
            srcFinalModel.add(stmt);
        }
    });
    Model tgtFinalModel = ModelFactory.createDefaultModel();
    tgtModel.listStatements().toSet().forEach((stmt) -> {
        Property property = stmt.getPredicate();
        if (!propList.contains(property.toString())) {
            tgtFinalModel.add(stmt);
        }
    });

    legato.setSource(FileManager.getCreatedRDFile("source", srcFinalModel));
    legato.setTarget(FileManager.getCreatedRDFile("target", tgtFinalModel));
    System.out.println("finish");
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Entry(java.util.Map.Entry) LEGATO(legato.LEGATO) Iterator(java.util.Iterator) Property(org.apache.jena.rdf.model.Property) Resource(org.apache.jena.rdf.model.Resource) Cluster(ca.pfv.spmf.patterns.cluster.Cluster) TreeMap(java.util.TreeMap) AlgoDBSCAN(ca.pfv.spmf.algorithms.clustering.dbscan.AlgoDBSCAN) Model(org.apache.jena.rdf.model.Model) DoubleArray(ca.pfv.spmf.patterns.cluster.DoubleArray)

Example 4 with LEGATO

use of legato.LEGATO in project legato by DOREMUS-ANR.

Method getDocuments of class DocumentBuilder.

/**
 * Builds one text document per resource, based on the selected properties.
 *
 * <p>For every resource returned by {@code CBDBuilder.getResources}, the values of its
 * properties listed in {@code selectedProp} are collected, cleaned with
 * {@code StopWords.clean}, and — unless the result is null, empty, a single newline or a
 * single space — stored under a generated document name. The document is also written
 * through {@code FileManager.create}, and the name/URI pair is registered in LEGATO when
 * {@code dataset} is "source" or "target".
 *
 * @param pathFile       path of the RDF dataset to load
 * @param classResources classes used to select the resources
 * @param selectedProp   URIs of the properties whose values make up a document
 * @param dataset        "source" or "target"
 * @return document name mapped to document content
 * @throws Exception if loading the dataset or writing a document fails
 */
public static HashMap<String, String> getDocuments(String pathFile, List<String> classResources, List<String> selectedProp, String dataset) throws Exception {
    LEGATO legato = LEGATO.getInstance();
    Model modelSource = ModelManager.loadModel(pathFile);
    // 1st String = the docName. 2nd String = its content
    HashMap<String, String> documents = new HashMap<String, String>();
    for (Resource resource : CBDBuilder.getResources(modelSource, classResources)) {
        // Collect the selected property values of this resource in a scratch model
        Model model = ModelFactory.createDefaultModel();
        String sparqlQueryString = "SELECT DISTINCT ?p ?o {<" + resource + "> ?p ?o }";
        Query query = QueryFactory.create(sparqlQueryString);
        QueryExecution qexec = QueryExecutionFactory.create(query, modelSource);
        try {
            ResultSet queryResults = qexec.execSelect();
            while (queryResults.hasNext()) {
                QuerySolution qs = queryResults.nextSolution();
                Resource prop = qs.getResource("?p");
                if (selectedProp.contains(prop.toString())) {
                    // NOTE(review): createResource(Resource) appears to be Jena's "new anonymous
                    // resource of this type" overload rather than "this resource" — confirm intent.
                    model.createResource(resource).addProperty(model.createProperty(prop.toString()), qs.get("?o").toString());
                }
            }
        } finally {
            // Always release the query execution, even if iteration fails
            qexec.close();
        }
        String docName = generateUUID(resource.getURI());
        // Preprocessing before document creation
        String docContent = StopWords.clean(CBDBuilder.getLiterals(model));
        // Skip resources without usable content (the former equals(null) test never caught null)
        if (docContent == null || docContent.equals("") || docContent.equals("\n") || docContent.equals(" ")) {
            continue;
        }
        if (dataset.equals("source")) {
            legato.setSrcUri(docName, resource.getURI());
        } else if (dataset.equals("target")) {
            legato.setTgtUri(docName, resource.getURI());
        }
        // Construct a document for each resource
        documents.put(docName, docContent);
        FileManager.create(docName, docContent, dataset);
    }
    return documents;
}
Also used : LEGATO(legato.LEGATO) Query(org.apache.jena.query.Query) HashMap(java.util.HashMap) QuerySolution(org.apache.jena.query.QuerySolution) Model(org.apache.jena.rdf.model.Model) Resource(org.apache.jena.rdf.model.Resource) ResultSet(org.apache.jena.query.ResultSet) File(java.io.File) QueryExecution(org.apache.jena.query.QueryExecution)

Example 5 with LEGATO

use of legato.LEGATO in project legato by DOREMUS-ANR.

Method createRDFile of class FileManager.

/**
 * Writes {@code model} to the file {@code <dirCluster>/<fileName>.<ext>}.
 *
 * <p>The model is serialized as N-Triples when {@code ext} is "nt" and as Turtle
 * otherwise. The file is written through an {@code OutputStream} so that the encoding is
 * chosen by the RDF writer instead of the platform default charset, and a failure while
 * flushing/closing the file is reported to the caller rather than silently ignored.
 *
 * @param dirCluster directory that receives the file
 * @param fileName   file name without extension
 * @param model      model to serialize
 * @param ext        file extension; "nt" selects N-Triples, anything else Turtle
 * @throws IOException if the file cannot be created, written or closed
 */
public static void createRDFile(File dirCluster, String fileName, Model model, String ext) throws IOException {
    String path = dirCluster.getAbsolutePath() + File.separator + fileName + "." + ext;
    String lang = ext.equals("nt") ? "N-TRIPLES" : "TTL";
    // Fully qualified names: no new import is needed in the enclosing file
    try (java.io.OutputStream out = new java.io.BufferedOutputStream(new java.io.FileOutputStream(path))) {
        model.write(out, lang);
    }
}
Also used : LEGATO(legato.LEGATO) FileWriter(java.io.FileWriter) IOException(java.io.IOException)

Aggregations

LEGATO (legato.LEGATO): 14; File (java.io.File): 7; Resource (org.apache.jena.rdf.model.Resource): 7; Model (org.apache.jena.rdf.model.Model): 6; IOException (java.io.IOException): 4; ArrayList (java.util.ArrayList): 4; HashMap (java.util.HashMap): 4; QueryExecution (org.apache.jena.query.QueryExecution): 4; FileWriter (java.io.FileWriter): 3; Iterator (java.util.Iterator): 3; Query (org.apache.jena.query.Query): 3; QuerySolution (org.apache.jena.query.QuerySolution): 3; ResultSet (org.apache.jena.query.ResultSet): 3; Property (org.apache.jena.rdf.model.Property): 3; DoubleArray (ca.pfv.spmf.patterns.cluster.DoubleArray): 2; AlignmentParser (fr.inrialpes.exmo.align.parser.AlignmentParser): 2; BufferedReader (java.io.BufferedReader): 2; InputStreamReader (java.io.InputStreamReader): 2; TreeMap (java.util.TreeMap): 2; Key (legato.keys.def.Key): 2